1 Setup

1.1 Load libraries

## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'vegan' was built under R version 3.6.3
## Warning: package 'DESeq2' was built under R version 3.6.3
## Warning: package 'phyloseq' was built under R version 3.6.3

1.2 All parameters

1.2.1 File paths

##### If you have the M3 google drive and bitbucket repo on your local machine,
##### you only need to change the two path parameters below.
# M3-shared google drive location (kept for reference):
#M3folder_loc_for_ps='/Google Drive/M3-shared/V4/Data/200312_ASVdata8_updateAsteria/ps_notnorm_age_filter_complete_family.rds'

# Load the non-normalized, age-filtered phyloseq object used throughout this
# analysis (user-specific path -- change to your local copy).
ps_not_norm_comp <- readRDS("~/M3_Datasets/ps_not_norm_age_filter_complete_family.rds")

## Output data location (subject to change); every csv/rds result below is written here.
#output_data=paste0(M3folder_loc, 'Data/V4/180808_ASVdata4/OutputData_Agefiltered/')
output_data <- "~/M3_Datasets/"

1.2.2 Cutoff parameters

# Analysis cutoff parameters referenced throughout the pipeline.
# (Changed `=` to the idiomatic `<-` for top-level assignment.)

# Minimum post-DADA2 read count for a sample to enter the analysis.
min_postDD <- 20000
# DESeq2 significance cutoff.
deseq_cut <- 0.05
# metagenomeSeq significance cutoff.
mtgseq_cut <- 0.05
# Chi-squared test cutoff (for diet questionnaire results significance).
chisq_cut <- 0.05
# PERMANOVA p-value and R2 cutoffs used when selecting variables to visualize.
permanova_pcut <- 0.05
permanova_cut <- 0.1

2 Summarize metadata

# chisquared test function: compare the distribution of one metadata variable
# between the phenotype groups (columns A and N below).
run_chisq_test <- function(ps, metacol){
  # ps: phyloseq object
  # metacol: metadata column name to test (character scalar)
  metaDF <- data.frame(sample_data(ps))
  # remove NAs; for some reason, some NA are recorded as the literal text "NA"!
  submetaDF=metaDF[!is.na(metaDF[, metacol]), ]
  submetaDF=submetaDF[submetaDF[, metacol]!='NA', ]
  submetaDF=submetaDF[submetaDF[, metacol]!='', ]   # also remove blank entries
  # chisquared test of metacol levels vs phenotype
  chisq_res=chisq.test(table(submetaDF[, metacol], submetaDF[, 'phenotype']))
  # observed counts in long format: V1 = metacol level, V2 = phenotype, N = count
  resDT=data.table(chisq_res$observed)
  # dcast to wide for printing: one count column per phenotype level
  resDT <- data.table(dcast(resDT, V1 ~ V2, value.var='N'))
  resDT <- resDT[, testvar:=metacol]
  resDT <- resDT[, chisq_p:=chisq_res$p.value]
  
  # NOTE(review): assumes the two phenotype levels are named exactly "A" and
  # "N" -- confirm against sample_data(ps)$phenotype coding.
  return(resDT[, list(testvar, category=V1, A, N, chisq_p)])
}

# composition plot function: stacked barplot of per-category counts for each
# phenotype group, for one tested variable.
plot_composition <- function(chisq_resDT, var_name){
  # chisq_resDT: data.table with columns testvar, category, A, N, chisq_p
  #   (the output of run_chisq_test); var_name: which testvar to plot
  # melt A/N into long "variable"/"value" pairs so they can be stacked
  plotDT=melt(chisq_resDT, id.vars=c('testvar', 'category', 'chisq_p'))
  p=ggplot(data=plotDT[testvar==var_name], aes(x=variable, y=value, fill=category))+
    geom_bar(stat="identity")+
    xlab('')+ylab('Number of sample')+
    ggtitle(var_name)+
    theme_minimal()+
    theme(legend.title=element_blank(), legend.position="bottom", axis.text.x=element_text(vjust=1, hjust=1))+
    scale_fill_manual(values=sgColorPalette)  # sgColorPalette: project palette defined elsewhere
  print(p)
}

2.1 Demographics

2.2 Fix metadata and remove samples that are too young or have diagnostic discrepancies

#Remove breastfed participants and their whole families
breast_fed <- c("089_A","054_N", "158_N" )

# family IDs containing a breastfed participant
b_fed<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% breast_fed)])

#remove the whole family (all samples sharing the family ID)
for (i in b_fed){
  ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)

}

#fixing the mapping file for stats by coercing each variable to categorical
#(factor) or non-categorical (numeric) per the curated category file
# NOTE(review): the write.csv/read.csv round-trip re-infers all column types
# -- presumably intentional to reset phyloseq's sample_data classes; confirm.
metadata_ok<-sample_data(ps_not_norm_comp)
write.csv(metadata_ok, "sam_data.csv")
map<-read.csv("sam_data.csv")
meta_cat <- read.csv("updated_metacategories.csv")
# coerce each PERMANOVA-eligible variable according to its declared category
for (i in meta_cat$varname.16S.V4[which(meta_cat$permanova != FALSE)]){
 if (meta_cat$permanova[which(meta_cat$varname.16S.V4 == i)] == "Categorical") {
    map[,i] <- as.factor(map[,i])
 } else {
    map[,i] <- as.numeric(map[,i])
 }
}


# Convert questionnaire text answers in the mapping data.frame to numeric
# codes (so they can be used in rank-based tests), de-duplicate rows by
# Biospecimen.Barcode, and set barcodes as rownames.
makeFieldsNumeric <- function(map){
  # Replace real NA and empty strings with the literal string "NA" so every
  # value has a key in the lookup dictionaries below.
  handleNAs <- function(vec){
    vec[vec == ""] <- "NA"
    vec[is.na(vec)] <- "NA"
    return(vec)
  }
  
  # Stool frequency: recode the open-ended categories to numbers.
  map$Stool.frequency <- handleNAs(as.character(map$Stool.frequency))
  map$Stool.frequency[as.character(map$Stool.frequency) == "Less than 1"] = 0
  map$Stool.frequency[as.character(map$Stool.frequency) == "5 or more"] = 5
  # NOTE(review): recoding the value 5 to "3-4 meals per week" looks
  # suspicious -- in freq_dict_1 below "5" maps to 2 while "3-4 meals per
  # week" maps to 1; confirm this is the intended correction.
  map$Dairy..consumption.frequency...longitudinal.[map$Dairy..consumption.frequency...longitudinal. == 5] <- "3-4 meals per week"
  #map$LR2[map$LR2 == "1 (ASD)"] = 1
  #map$LR2[map$LR2 == "0 (non-ASD"] = 0
  
  
  # Never..Daily frequency scale, coded 0-6; the "NA" string maps to NA.
  freq_dict_2 <- list("Never" = 0, "Rarely" = 1, "Occasionally" = 2, "Regularly" = 3, "Weekly" = 4, "weekly" = 4,
                      "Several time weekly" = 5, "Several times weekly" = 5, "Daily" = 6, "NA" = NA)
  dict_2_items <- c("Whole.grain..consumption.frequency.", "Fermented.vegetable..consumption.frequency.", "Dairy..consumption.frequency.",                     "Fruit..consumption.frequency.", "Meals.prepared.at.home..consumption.frequency.",   "Ready.to.eat.meals..consumption.frequency.", "Red.meat..consumption.frequency.", "Olive.oil.used.in.cooking..M3.", "Seafood..consumption.frequency.",   "Sweetened.drink..consumption.frequency.", "Vegetable..consumption.frequency.",
                    "Restaurant.prepared.meals..consumption.frequency.", "Sugary.food..consumption.frequency.", "Probiotic..consumption.frequency.", "Vitamin.B.complex.supplement..consumption.frequency.", "Vitamin.D..consumption.frequency.")
  # NOTE(review): after handleNAs, freqs never contains real NA, so
  # !is.na(freqs) is always TRUE; correctness relies on unlist(dict[freqs])
  # returning one value per entry (unmatched answers would be silently
  # dropped, shifting the assignment).  The printed lengths are the sanity
  # check for that -- confirm they always equal nrow(map).
  for(item in dict_2_items){
    print(item)
    tmp <- rep(NA, nrow(map))
    freqs <- handleNAs(map[,item])
    numeric_rep <- unlist(freq_dict_2[freqs])
    print(paste("Numeric rep length: ", length(numeric_rep)))
    print(sum(!is.na(freqs)))
    tmp[!is.na(freqs)] <- as.numeric(numeric_rep)  
    map[ , item] <- tmp
  }
  
  
  # Longitudinal meals-per-week scale, coded 0-4.
  freq_dict_1 <- list("Never or less than once per week" = 0, "3-4 meals per week" = 1, "5" = 2, "7-10 meals per week" = 3, "Almost every meal" = 4, "NA" = NA)
  dict_1_items <- c("Starchy.food..consumption.frequency...longitudinal.", "Meats.and.seafood..consumption.frequency...longitudinal.", "Bread..consumption.frequency...longitudinal.", "Dairy..consumption.frequency...longitudinal.", "Dietary.fat.and.oil..consumption.frequency...longitudinal.", "Vegetable..consumption.frequency...longitudinal.", 
                    "Fruit..consumption.frequency...longitudinal.")
  for(item in dict_1_items){
    print(item)
    tmp <- rep(NA, nrow(map))
    freqs <- handleNAs(map[ , item])
    numeric_rep <- unlist(freq_dict_1[freqs])
    print(paste("Numeric rep length: ", length(numeric_rep)))
    print(sum(!is.na(freqs)))
    tmp[!is.na(freqs)] <- as.numeric(numeric_rep)  
    map[ , item] <- tmp
  }
  
  # May add more; these variables only apply to the ASD phenotype questions.
  # (Note this reuses/overwrites the freq_dict_2 name from above.)
    freq_dict_2 <- list("Able to speak fluently" = 3,"Phrase speech"=2, "Single word speech"=1, "Little to no speech" = 0, "Able to have conversation" = 3, "Limited conversation ability" = 2, "Difficulty with conversation" = 1, "Cannot have a conversation" = 0,"Understands about half of words" = 1, "Understands few or no words"= 0, "Understands many words" = 2,  "Understands most words"= 3, "Understands nearly all words" = 4 ,"NA" = NA)
  dict_2_items <- c("Language.ability.and.use", "Conversation.ability", "Understands.speech")
  for(item in dict_2_items){
    print(item)
    tmp <- rep(NA, nrow(map))
    freqs <- handleNAs(map[ , item])
    numeric_rep <- unlist(freq_dict_2[freqs])
    print(paste("Numeric rep length: ", length(numeric_rep)))
    print(sum(!is.na(freqs)))
    tmp[!is.na(freqs)] <- as.numeric(numeric_rep)  
    map[ , item] <- tmp
  }
  
  
  # De-duplicate by barcode, index rows by barcode, and finish the stool
  # frequency conversion started above.
  map <- map[!duplicated(map$Biospecimen.Barcode), ]
  rownames(map) <- map$Biospecimen.Barcode
  map$Stool.frequency <- as.numeric(map$Stool.frequency)
  return(map)

}

# NOTE(review): these two vectors duplicate the definitions inside
# makeFieldsNumeric() and do not appear to be used again below -- presumably
# leftovers; confirm before removing.
dict_1_items <- c("Starchy.food..consumption.frequency...longitudinal.", "Meats.and.seafood..consumption.frequency...longitudinal.", "Bread..consumption.frequency...longitudinal.", "Dairy..consumption.frequency...longitudinal.", "Dietary.fat.and.oil..consumption.frequency...longitudinal.", "Vegetable..consumption.frequency...longitudinal.", 
                    "Fruit..consumption.frequency...longitudinal.")
dict_2_items <- c("Language.ability.and.use", "Conversation.ability", "Understands.speech")


# Convert questionnaire fields to numeric codes; prints, per field, its name
# and the count of non-missing entries as a sanity check (output below).
map<-makeFieldsNumeric(map)
## [1] "Whole.grain..consumption.frequency."
## [1] "Numeric rep length:  447"
## [1] 447
## [1] "Fermented.vegetable..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Dairy..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Fruit..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Meals.prepared.at.home..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Ready.to.eat.meals..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Red.meat..consumption.frequency."
## [1] "Numeric rep length:  78"
## [1] 78
## [1] "Olive.oil.used.in.cooking..M3."
## [1] "Numeric rep length:  447"
## [1] 447
## [1] "Seafood..consumption.frequency."
## [1] "Numeric rep length:  447"
## [1] 447
## [1] "Sweetened.drink..consumption.frequency."
## [1] "Numeric rep length:  447"
## [1] 447
## [1] "Vegetable..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Restaurant.prepared.meals..consumption.frequency."
## [1] "Numeric rep length:  447"
## [1] 447
## [1] "Sugary.food..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Probiotic..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Vitamin.B.complex.supplement..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Vitamin.D..consumption.frequency."
## [1] "Numeric rep length:  450"
## [1] 450
## [1] "Starchy.food..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Meats.and.seafood..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Bread..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Dairy..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Dietary.fat.and.oil..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Vegetable..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Fruit..consumption.frequency...longitudinal."
## [1] "Numeric rep length:  421"
## [1] 421
## [1] "Language.ability.and.use"
## [1] "Numeric rep length:  225"
## [1] 225
## [1] "Conversation.ability"
## [1] "Numeric rep length:  225"
## [1] 225
## [1] "Understands.speech"
## [1] "Numeric rep length:  225"
## [1] 225
# Count factor levels per column; columns with many (>= 18) levels are treated
# as free text / identifiers rather than true categoricals and coerced back to
# character.
# NOTE(review): the 18-level cutoff is a heuristic -- confirm it matches the
# questionnaire design.
map_levels<-sapply(map, levels)
map_levelscount<-sapply(map_levels, length)
mapnotfac <- names(map_levelscount[which(map_levelscount >= 18)])

for (i in mapnotfac){
  map[,i]<-as.character(map[,i])
}

#Round age in years to the nearest integer
map$Age..years. <-round(map$Age..years.)
# push the cleaned mapping back into the phyloseq object
sample_data(ps_not_norm_comp) <- map

#remove samples that are too young (< 24 months) and their paired siblings
tooyoung <-ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Age..months. < 24)]

#family 65 should be removed according to the >24 months criterion
# NOTE(review): this assumes unique(tooyoung) is a single family ID -- if more
# than one family had under-24-month samples, the != comparison would recycle
# and silently keep the wrong samples; confirm.
ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != unique(tooyoung), ps_not_norm_comp)

#List of individuals that were reported w/ autism, but were not classified as
#such through MARA and/or the video classifier
pheno_contrad <-read.csv("phenotype_contradictions.csv")
contradicting<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% as.character(pheno_contrad$host_name))])

#remove the whole family of each contradicting individual
for (i in contradicting){
  ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)

}

#round off age in years (already rounded before sample_data() was re-assigned
#above, so this repeat is presumably a harmless no-op -- confirm)
ps_not_norm_comp@sam_data$Age..years.<-round(ps_not_norm_comp@sam_data$Age..years.)

# snapshot of the cleaned metadata used by the sections below
metadata_ok<-ps_not_norm_comp@sam_data

2.2.1 Biological.sex

#now let's only run the chi-square on categorical values, and remove the first
#columns which are not metadata
num_cat<-names(Filter(is.numeric, metadata_ok))
fac_cat<-names(Filter(is.factor, metadata_ok))
#removing the first 13 columns, since they are not metadata, and the last one which is phenotype
fac_cat<-fac_cat[-c(1:13, length(fac_cat))]
#finally removing the ones that were only asked for the children with ASD, or only have one factor & NA, or are only present in one phenotype
fac_cat<-fac_cat[-which(fac_cat %in% c("Behavior.video.submitted..M3.","Language.ability.and.use","Conversation.ability","Understands.speech","Plays.imaginatively.when.alone","Plays.imaginatively.with.others","Plays.in.a.group.with.others","Eye.contact.finding","Childhood.behavioral.development.finding","Repetitive.motion","Picks.up.objects.to.show.to.others","Sleep.pattern.finding","Response.to.typical.sounds","Self.injurious.behavior.finding","Gastrointestinal.problems..M3.", "Imitation.behavior", "Other.stool.sample.collection.method.explained..M3.", "Flu.shot.in.the.last..MFlu.shot.in.the.last..M3.", "Pica.disease", "Additional.info.affecting.microbiome..M3.", "Dietary.restrictions.details..M3.", "Pet.bird"))]



#Also remove the ones with only one factor (no chi-square possible)
#now running the chisquare on all categorical values
# accumulators filled by the loop in section 2.2.1 below
chisquare_p.val=c()
names_chisquare_p.val=c()
all_chisquare=list()
# candidate variables: everything with more than one level...
chi_list<-names(map_levelscount)[map_levelscount > 1]
# ...minus the first 9 columns (presumably sample identifiers, not metadata -- confirm)
chi_list <-chi_list[-c(1:9)]
# ...minus ASD-only questions, free-text fields, and variables confounded with phenotype itself
chi_list<-chi_list[-which(chi_list %in% c("Behavior.video.submitted..M3.","Language.ability.and.use","Conversation.ability","Understands.speech","Plays.imaginatively.when.alone","Plays.imaginatively.with.others","Plays.in.a.group.with.others","Eye.contact.finding","Childhood.behavioral.development.finding","Repetitive.motion","Picks.up.objects.to.show.to.others","Sleep.pattern.finding","Response.to.typical.sounds","Self.injurious.behavior.finding","Gastrointestinal.problems..M3.", "Imitation.behavior", "Other.stool.sample.collection.method.explained..M3.", "Flu.shot.in.the.last..MFlu.shot.in.the.last..M3.", "Pica.disease", "Additional.info.affecting.microbiome..M3.", "Dietary.restrictions.details..M3.", "Pet.bird", "Host.disease.status", "phenotype"))]

# numeric metadata columns (first 4 presumably identifiers -- confirm)
map_num<-sapply(map, is.numeric)
num_cat <- colnames(map[,as.vector(which(map_num == TRUE))])
num_cat <- num_cat[-c(1:4)]
# drop ASD-only and sparsely-filled numeric variables
num_cat <- num_cat[-which(num_cat %in% c("Language.ability.and.use", "Conversation.ability", "Understands.speech", "Mobile.Autism.Risk.Assessment.Score", "Number.of.small.pet.herbivores", "Number.of.small.pet.rodents", "phenotype_num", "Number.of.pet.birds"))]

#Run chi-squared on the categorical and numeric candidate variables together
chi_list <- c(chi_list, num_cat)

# Run the chi-squared test for every candidate variable.  Results are
# collected with lapply/vapply (preallocated) instead of growing
# chisquare_p.val / all_chisquare inside a 1:length() loop.
all_chisquare <- lapply(chi_list, function(v) run_chisq_test(ps_not_norm_comp, v))
names(all_chisquare) <- chi_list
# one (minimum) p-value per tested variable
chisquare_p.val <- vapply(all_chisquare, function(res) min(res$chisq_p), numeric(1))
names(chisquare_p.val) <- chi_list

# p-value correction (p.adjust default: Holm) and selection of significant variables
chisquare_p.val <- p.adjust(chisquare_p.val)
chisquare_p.val <- chisquare_p.val[chisquare_p.val < 0.05]

length(chisquare_p.val) # number of significant variables after correction (37 in this run; old "#41" comment was stale)
## [1] 37
chisquare_p.val
##                             Functional.bowel.finding 
##                                         1.789615e-07 
##                                       Dietary.regime 
##                                         9.029215e-03 
##                     GI.symptoms.within.3.months..M3. 
##                                         4.521062e-14 
##                                       Biological.sex 
##                                         3.976313e-07 
##                             GI.issues.this.week..M3. 
##                                         3.552499e-10 
##                        Non.celiac.gluten.sensitivity 
##                                         1.134098e-04 
##                                  Lactose.intolerance 
##                                         1.321945e-02 
##                            Dietary.restrictions..M3. 
##                                         2.003499e-07 
##                                   Dietary.supplement 
##                                         1.915985e-08 
##                                  LR6.prediction..M3. 
##                                         6.539853e-15 
##                                 LR10.prediction..M3. 
##                                         2.079463e-11 
##                                  LR5.prediction..M3. 
##                                         7.924848e-16 
##                                       Toilet.trained 
##                                         1.540770e-02 
##                        Other.symptoms.this.week..M3. 
##                                         3.743232e-02 
##                  Recent.anxiety..caretaker.reported. 
##                                         1.106379e-03 
##                                    Toilet.cover..M3. 
##                                         1.530253e-03 
##                 Most.recent.GI.episode.symptoms..M3. 
##                                         2.423190e-04 
##                                      Stool.frequency 
##                                         4.117528e-04 
##                        Dairy..consumption.frequency. 
##                                         9.851763e-07 
##                        Fruit..consumption.frequency. 
##                                         2.390145e-08 
##       Meals.prepared.at.home..consumption.frequency. 
##                                         1.846027e-02 
##                    Vegetable..consumption.frequency. 
##                                         2.685047e-06 
##                                         Age..months. 
##                                         5.262075e-15 
##                    Probiotic..consumption.frequency. 
##                                         3.155060e-08 
## Vitamin.B.complex.supplement..consumption.frequency. 
##                                         2.663542e-08 
##                    Vitamin.D..consumption.frequency. 
##                                         2.765978e-08 
##                         LR6.probability.not.ASD..M3. 
##                                         2.156944e-10 
##                             LR6.probability.ASD..M3. 
##                                         2.156944e-10 
##                        LR10.probability.not.ASD..M3. 
##                                         2.156944e-10 
##                            LR10.probability.ASD..M3. 
##                                         2.156944e-10 
##                         LR5.probability.not.ASD..M3. 
##                                         1.783339e-12 
##                             LR5.probability.ASD..M3. 
##                                         1.783339e-12 
##                                          Age..years. 
##                                         1.293926e-14 
##         Bread..consumption.frequency...longitudinal. 
##                                         4.062774e-04 
##         Dairy..consumption.frequency...longitudinal. 
##                                         1.815090e-05 
##     Vegetable..consumption.frequency...longitudinal. 
##                                         4.698634e-05 
##         Fruit..consumption.frequency...longitudinal. 
##                                         1.134098e-04
# Visualization of the chi-squared results: keep only the variables that
# remained significant after p-value adjustment.
all_chisquare<-all_chisquare[names(all_chisquare) %in% names(chisquare_p.val)]
# save the adjusted significant p-values into a csv
write.csv(format(chisquare_p.val, digits=2), file=paste0(output_data,"Xsqr_05.csv"), quote=FALSE)
# Plot one example of the significant variables.
# FIX: plot_composition() expects the result table itself (columns testvar,
# category, A, N, chisq_p), so extract with [[ rather than [ (which would pass
# a one-element list and break melt()).
plot_composition(all_chisquare[[1]], names(all_chisquare)[1])

2.2.2 Racial.group

# cross-tabulate racial group by biological sex (count table)
table(sample_data(ps_not_norm_comp)$Racial.group, sample_data(ps_not_norm_comp)$Biological.sex)
##                                         
##                                          Female Male
##   Asian race                                 12   30
##   Asian race & Middle Eastern race            3    3
##   Asian race & Unknown racial group           3    3
##   Caucasian                                  81  165
##   Caucasian & Hispanic                        0   12
##   Caucasian & Indian (subcontinent) race      0    6
##   Hispanic                                    6   18
##   Hispanic & African race                     6    0
##   Indian (subcontinent) race                  3    3
##   Indian race                                 6    6
##   Unknown racial group                        0   12
# run chisquared test of racial group vs phenotype
race=run_chisq_test(ps_not_norm_comp, 'Racial.group')
# print per-category A/N counts
pander(race)
testvar category A N chisq_p
Racial.group Asian race 21 21 1
Racial.group Asian race & Middle Eastern race 3 3 1
Racial.group Asian race & Unknown racial group 3 3 1
Racial.group Caucasian 123 123 1
Racial.group Caucasian & Hispanic 6 6 1
Racial.group Caucasian & Indian (subcontinent) race 3 3 1
Racial.group Hispanic 12 12 1
Racial.group Hispanic & African race 3 3 1
Racial.group Indian (subcontinent) race 3 3 1
Racial.group Indian race 6 6 1
Racial.group Unknown racial group 6 6 1
# stacked barplot of racial composition per phenotype
plot_composition(race, 'Racial.group')

# % table: per-phenotype column percentages (margin=2 normalizes each column)
race_prop=prop.table(as.matrix(race[, .(A, N)]), margin=2)*100
row.names(race_prop) <- race$category
pander(race_prop)
  A N
Asian race 11.11 11.11
Asian race & Middle Eastern race 1.587 1.587
Asian race & Unknown racial group 1.587 1.587
Caucasian 65.08 65.08
Caucasian & Hispanic 3.175 3.175
Caucasian & Indian (subcontinent) race 1.587 1.587
Hispanic 6.349 6.349
Hispanic & African race 1.587 1.587
Indian (subcontinent) race 1.587 1.587
Indian race 3.175 3.175
Unknown racial group 3.175 3.175
# write the percentage table to the output folder
write.csv(race_prop, file=paste0(output_data, 'Race.csv'))

2.2.3 Age..months.

# make sure age in months is numeric before plotting/testing
sample_data(ps_not_norm_comp)$Age..months. <- as.numeric(sample_data(ps_not_norm_comp)$Age..months.)

# boxplot + jittered points of age (months) by phenotype
ggplot(data=sample_data(ps_not_norm_comp), aes(x=phenotype, y=Age..months., fill=phenotype))+
  geom_boxplot(width=0.7, outlier.colour='white')+
  geom_jitter(size=1, position=position_jitter(width=0.1))+
  xlab('')+ylab('Age (months)')+
  scale_fill_manual(values=sgColorPalette)+
  theme_minimal()

# run tests to check significance
shapiro.test(sample_data(ps_not_norm_comp)$Age..months.) # not normal, so a rank-based test is needed
## 
##  Shapiro-Wilk normality test
## 
## data:  sample_data(ps_not_norm_comp)$Age..months.
## W = 0.96494, p-value = 7.196e-08
# NOTE(review): var.equal is not a wilcox.test() argument; it is silently
# swallowed by `...` and has no effect here.
wilcox.test(Age..months. ~ phenotype, data=data.frame(sample_data(ps_not_norm_comp)), var.equal=FALSE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Age..months. by phenotype
## W = 20889, p-value = 0.004351
## alternative hypothesis: true location shift is not equal to 0
# Generalize the phenotype comparison to every numeric metadata variable:
# Wilcoxon rank-sum test of each numeric column vs phenotype (the data are not
# normal -- see the Shapiro test above).
# Changes from the original: the p-values are collected with vapply instead of
# growing a vector in a 1:length() loop, and the meaningless var.equal=FALSE
# argument (a t.test option that wilcox.test() silently ignores) is removed.
wilcox_pval <- vapply(
  num_cat,
  function(v) wilcox.test(get(v) ~ phenotype, data = map)$p.value,
  numeric(1)
)
# multiple-testing correction (p.adjust default: Holm) and significant hits
wilcox_pval <- p.adjust(wilcox_pval)
wilcox_pval[wilcox_pval < 0.05]  # LR probabilities, vegetable, bread, dairy, fruit, sweet drink
##                  Seafood..consumption.frequency. 
##                                     3.362736e-02 
##                Vegetable..consumption.frequency. 
##                                     3.153965e-02 
##                                     Age..months. 
##                                     9.539155e-03 
##                     LR6.probability.not.ASD..M3. 
##                                     5.653878e-20 
##                         LR6.probability.ASD..M3. 
##                                     5.653878e-20 
##                    LR10.probability.not.ASD..M3. 
##                                     4.598240e-22 
##                        LR10.probability.ASD..M3. 
##                                     4.598240e-22 
##                     LR5.probability.not.ASD..M3. 
##                                     9.546453e-19 
##                         LR5.probability.ASD..M3. 
##                                     9.546453e-19 
##                                      Age..years. 
##                                     5.614868e-03 
##     Bread..consumption.frequency...longitudinal. 
##                                     8.026472e-05 
##     Dairy..consumption.frequency...longitudinal. 
##                                     2.591744e-05 
## Vegetable..consumption.frequency...longitudinal. 
##                                     3.184748e-02 
##     Fruit..consumption.frequency...longitudinal. 
##                                     3.043812e-03

2.3 Dietary variance

Dietary variance amongst ASD patients will also be assessed. Based on preliminary analyses, we expect that ASD participants, collectively, will exhibit a minimal degree of dietary variance.

# read metadata category file
#meta_cat=fread(paste0(M3folder_loc, meta_cat_fp), header=TRUE)

#This csv was created from the file in the shared M3 google drive: first sheet
#taken, last incomplete column removed, then converted to csv.
meta_cat<-read.csv("updated_metacategories.csv")
colnames(meta_cat)[1] <- "varname"

# subset the metadata to the diet questions flagged in the category file
diet_info<-metadata_ok[,colnames(metadata_ok) %in% meta_cat$varname[which(meta_cat$diet==TRUE)]]
#additional columns to remove: filled with only NA or a single factor level
#(chi-square needs at least two levels)
diet_col_levels<-sapply(diet_info, levels)
dietcol_levelscount<-sapply(diet_col_levels, length)

#The numeric dietary variables are only two and filled primarily with NAs (see
#the per-field counts printed above), so we omit them.
#diet_info[,which(sapply(diet_info, class) == "numeric")]
diet_info <- diet_info[,which(dietcol_levelscount >= 2)]

#dietq_col <-which(colnames(sample_data(ps_not_norm_comp)) %in% colnames(diet_info))
dietq_col <- colnames(diet_info)

# For each dietary variable, summarize counts and test whether the A vs N
# composition differs (hypothesis: dietary variance in ASD matches NT).
# Build the list of per-variable result tables first and bind once, instead of
# rbindlist-ing inside the loop (rbindlist(list(NULL, x)) drops the NULL, so
# the result is identical).
master_resDT <- rbindlist(
  lapply(dietq_col, function(v) run_chisq_test(ps_not_norm_comp, v))
)

# variables tested
unique(master_resDT$testvar)
## [1] "Dietary.regime"               "Meat..consumption.frequency."
## [3] "Multivitamin"                 "Dietary.supplement"
# order by significance (ascending chi-squared p-value)
master_resDT <- master_resDT[order(chisq_p)]

# print interactive table
datatable(master_resDT)
# write csv files: full per-category counts, and one p-value per variable
write.csv(master_resDT, file=paste0(output_data, 'Dietary_var_all_proportion.csv'))
write.csv(unique(master_resDT[, list(testvar, chisq_p)]), file=paste0(output_data, 'Dietary_var_chisq_test.csv'))

# plot the top 3 most significant vars (table is already ordered by p-value)
plot_diet=master_resDT[testvar %in% unique(master_resDT[, list(testvar, chisq_p)])$testvar[1:3]]
for(i in unique(plot_diet$testvar)){
  plot_composition(plot_diet, i)
}

3 Normalization

# Create the output directory for the normalized phyloseq objects.
# showWarnings=FALSE makes the script idempotent (no warning when the
# directory already exists on re-runs); recursive=TRUE creates any missing
# parent directories.
dir.create(paste0(output_data, 'Normalized/'), showWarnings = FALSE, recursive = TRUE)

#Filtering of the prevalence: 
###Declare function to filter
# Keep only taxa detected (count > 0) in strictly more than the given fraction
# of samples.
#   ps: phyloseq object (taxa are rows of the OTU table)
#   percentSamplesPresentIn: fraction of samples (0-1) a taxon must exceed
# Returns the pruned phyloseq object.
filterTaxaByPrevolence <- function(ps, percentSamplesPresentIn){
  min_samples <- percentSamplesPresentIn * nsamples(ps)
  counts <- data.frame(otu_table(ps))
  # per-taxon number of samples with a non-zero count
  present_in <- rowSums(counts > 0)
  prune_taxa(present_in > min_samples, ps)
}

#CSS norm function
#We actually will plot everything with CSS 
# Cumulative sum scaling (CSS) normalization via metagenomeSeq; this is the
# normalization used for most plots.
# Converts ps to an MRexperiment, computes the cumNorm scaling percentile, and
# rebuilds a phyloseq object from the normalized counts, keeping the original
# sample data and taxonomy table.
CSS_norm <- function(ps){
  mr_exp <- phyloseq_to_metagenomeSeq(ps)
  scaling_percentile <- cumNormStatFast(mr_exp)
  mr_exp <- cumNorm(mr_exp, p = scaling_percentile)
  normalized_counts <- MRcounts(mr_exp, norm = TRUE)
  phyloseq(
    otu_table(normalized_counts, taxa_are_rows = TRUE),
    sample_data(ps),
    tax_table(ps)
  )
}

#Deseq norm 
# DESeq2 variance-stabilizing normalization.
# Builds a DESeq2 dataset from ps (design ~ phenotype), estimates size factors
# with the "poscounts" estimator, estimates dispersions, and extracts the
# variance-stabilized abundances.  The matrix is then shifted up by |min| so
# no normalized value is negative, and a phyloseq object is rebuilt with the
# original sample data and taxonomy.
deSeqNorm <- function(ps){
  dds <- phyloseq_to_deseq2(ps, ~ phenotype)
  dds <- estimateSizeFactors(dds, type = "poscounts")
  dds <- estimateDispersions(dds)
  vst_abund <- getVarianceStabilizedData(dds)
  # shift so the minimum value becomes zero (don't allow negative "counts")
  vst_abund <- vst_abund + abs(min(vst_abund))
  phyloseq(
    otu_table(vst_abund, taxa_are_rows = TRUE),
    sample_data(ps),
    tax_table(ps)
  )
}

#Now we remove the taxa present in less than 3 % of the samples with some basic prevalence filtering
filtered_ps003<-filterTaxaByPrevolence(ps_not_norm_comp, 0.03)
filtered_ps003
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 858 taxa and 378 samples ]
## sample_data() Sample Data:       [ 378 samples by 365 sample variables ]
## tax_table()   Taxonomy Table:    [ 858 taxa by 8 taxonomic ranks ]
# NOTE(review): the ".Rda" extension is misleading -- saveRDS writes RDS
# format, and these files must be loaded with readRDS (as done at the top).
saveRDS(filtered_ps003, file=paste0(output_data, "Normalized/ps_not_norm_comp_pass_min_postDD_min0.03.Rda"))

# CSS normalization (used for most plots)
ps_CSS_norm_pass_min_postDD_sup003<-CSS_norm(filtered_ps003)
saveRDS(ps_CSS_norm_pass_min_postDD_sup003, file=paste0(output_data, "Normalized/ps_CSS_pass_min_postDD_min0.03.Rda"))

# DESeq variance-stabilizing normalization
ps_DeSeq_norm_pass_min_postDD_sup003<-deSeqNorm(filtered_ps003)
saveRDS(ps_DeSeq_norm_pass_min_postDD_sup003, file=paste0(output_data, "Normalized/ps_DeSeq_pass_min_postDD_min0.03.Rda"))

# TSS normalization: per-sample relative abundance (margin=2 -> each sample
# column sums to 1)
propDF=prop.table(as.matrix(otu_table(filtered_ps003)), margin=2)
ps_TSS_norm_pass_min_postDD_sup003 <- phyloseq(otu_table(propDF, taxa_are_rows=TRUE), 
                                               tax_table(filtered_ps003), 
                                               sample_data(filtered_ps003))

3.0.1 Repeated measures ANOVA to find ASVs that have means that change significantly between timepoints

# Repeated-measures ANOVA per ASV: does the mean abundance of each ASV change
# significantly between sampling timepoints (within host)?
# Format the ASV table with timepoint + host name info.
asv_table<-t(otu_table(ps_DeSeq_norm_pass_min_postDD_sup003))
asv_table <- as.data.frame(asv_table)
asv_table$timepoint <- ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Within.study.sampling.date
asv_table$timepoint <- as.factor(asv_table$timepoint)
asv_table$Host.Name <- ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Host.Name

# The last two columns are metadata (timepoint, Host.Name); everything before
# them is an ASV.  One repeated-measures ANOVA per ASV, collecting the
# within-host timepoint p-value.  Results are preallocated and bound once --
# this replaces the original pattern of a special-cased first column plus
# rbind() inside the loop.
n_asv <- ncol(asv_table) - 2
anova_res_list <- vector("list", n_asv)
for (i in seq_len(n_asv)) {
  fit <- summary(aov(asv_table[, i] ~ timepoint + Error(Host.Name/timepoint),
                     data = asv_table))
  p_val <- fit$`Error: Host.Name:timepoint`[[1]]$`Pr(>F)`[1]
  anova_res_list[[i]] <- tibble(ASV = colnames(asv_table)[i], p_val = p_val)
}
anova_asv_res <- do.call(rbind, anova_res_list)

# ASVs whose means differ significantly between timepoints.
# NOTE(review): these p-values are not corrected for multiple testing across
# ASVs -- confirm this is intended.
asv_sig_btwn_timep_DES<-anova_asv_res$ASV[which(anova_asv_res$p_val <= 0.05)]

asv_sig_btwn_timep_DES
##  [1] "GCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCGCGTAGGTGGTTCAGCAAGTTGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATCCAAAACTACTGAGCTAGAGTACGGTAGAGGGTGGTGGAATTTCCTGTGTAGCGGTGAAATGCGTAGATATAGGAAGGAACACCAGTGGCGAAGGCGACCACCTGGACTGATACTGACACTGAGGTGCGAAAGCGTGGGG"
##  [2] "GCAAGCGTTATCCGGAATCATTGGGCGTAAAGGGTGCGTAGGTGGCGTACTAAGTCTGTAGTAAAAGGCAATGGCTCAACCATTGTAAGCTATGGAAACTGGTATGCTAGAGTGCAGAAGAGGGCGATGGAATTCCATGTGTAGCGGTAAAATGCGTAGATATATGGAGGAACACCAGTGGCGAAGGCGGTCGCCTGGTCTGTAACTGACACTGAGGCACGAAAGCGTGGGG" 
##  [3] "GCAAGCGTTATCCGGATTTACTGGGCGTAAAGGGAGCGTAGGCGGATATTTAAGTGGGATGTGAAATACCTGAGCTTAACTTGGGAGCTGCATTCCAAACTGGATATCTAGAGTGCAGGAGAGGAGAATGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAAGAACACCAGTGGCGAAGGCGATTCTCTGGACTGTAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [4] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGCATGATAAGTCTGATGTGAAAACCCAAGGCTCAACCATGGGACTGCATTGGAAACTGTCGTGCTAGAGTGTCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATGACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [5] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGGACTGCAAGTTGGATGTGAAATACCGTGGCTTAACCACGGAACTGCATCCAAAACTGTAGTTCTTGAGTGAAGTAGAGGCAAGCGGAATTCCGAGTGTAGCGGTGAAATGCGTAGATATTCGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGGCTTTAACTGACGCTGAGGCTCGAAAGTGTGGGG"
##  [6] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTCTGGCAAGTCTGATGTGAAAATCCGGGGCTCAACTCCGGAACTGCATTGGAAACTGTCAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [7] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCAGTGCAAGTCTGAAGTGAAAGCCTGGGGCTCAACCCCGGAACTGCTTTGGAAACTGTGCTGCTTGAGTACCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [8] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCAGTGCAAGTCTGAAGTGAAAGGCTGGGGCTCAACCCCGGAACTGCTTTGGAAACTGTGCTGCTTGAGTACCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [9] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGATGCAAGTCTGGAGTGAAAGCCCAGGGCTCAACCCTGGGACTGCTTTGGAAACTGTATGGCTAGAGTGCTGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
## [10] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTATGGCAAGTCTGATGTGAAAGGCCGGGGCTCAACCCCGGGACTGCATTGGAAACTGCCAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGACAACTGACGCTGAGGCTCGAAAGCGTGGGG"
## [11] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTTATGCAAGTCCGATGTGAAAGCCCGGGGCTTAACCCCGGGACTGCATTAGAAACTGTGTAACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTGGGG"
## [12] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGTATGGCAAGTCAGAAGTGAAAGGCTGGGGCTCAACCCCGGGACTGCTTTTGAAACTGTCAAACTAGAGTACAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGAAACTGACACTGAGGCACGAAAGCGTGGGG"
## [13] "GCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCTCGGCTTAACCGAGGAAGTGCATCGGAAACTGGGAAACTTGAGTACAGAAGAGGACAGTGGAACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTGTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [14] "GCAAGCGTTATCCGGATTTATTGGGTGTAAAGGGTGCGTAGACGGGAAGGTAAGTTAGTTGTGAAATCCCTCGGCTCAACTGAGGAACTGCGACTAAAACTGCTTTTCTTGAGTGCTGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACAGCAACTGACGTTGAGGCACGAAAGTGTGGGG"
## [15] "GCAAGCGTTGTCCGGAATAATTGGGCGTAAAGGGCGCGTAGGCGGCTCGGTAAGTCTGGAGTGAAAGTCCTGCTTTTAAGGTGGGAATTGCTTTGGATACTGTCGGGCTTGAGTGCAGGAGAGGTTAGTGGAATTCCCAGTGTAGCGGTGAAATGCGTAGAGATTGGGAGGAACACCAGTGGCGAAGGCGACTAACTGGACTGTAACTGACGCTGAGGCGCGAAAGTGTGGGG"
## [16] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCCGTTTAAGTCAGATGTGAAATACCCGTGCTTAACATGGGGGCTGCATTTGAAACTGGATGGCTTGAGTGCAGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTGCTGGACTGTAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [17] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCGCGTTAAGTCAGATGTGAAATACCCGTGCTTAACATGGGGGGTGCATCTGATACTGGCGTGCTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [18] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGTGGCCATGTAAGTCAGGTGTGAAAGACCGGGGCTCAACCCCGGGGTTGCACTTGAAACTGTGTGGCTTGAGTACAGGAGAGGGAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTTCTGGACTGTAACTGACACTGAGGCACGAAAGCGTGGGG"
## [19] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGAGCAAGTTGGAAGTGAAATCTGTGGGCTCAACTCACAAATTGCTTTCAAAACTGTTTTTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [20] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGTGATCAAGTCAGCTGTGAAAACTACGGGCTTAACCCGTAGACTGCAGTTGAAACTGTTCATCTTGAGTGAAGTAGAGGTTGGCGGAATTCCGAGTGTAGCGGTGAAATGCGTAGATATTCGGAGGAACACCGGTGGCGAAGGCGGCCAACTGGGCTTTAACTGACGCTGAGGCTCGAAAGTGTGGGG"
## [21] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGGAGGCAAGTTGAATGTCTAAACTATCGGCTCAACTGATAGTCGCGTTCAAAACTGCCACTCTTGAGTGCAGTAGAGGTAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGT" 
## [22] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGAGTACGTAGGCGGTTTGCTAAGCGCAAGGTGAAAGGCAGTGGCTTAACCATTGTAAGCCTTGCGAACTGACAGACTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGCTGAGGTACGAAAGCGTGGGG" 
## [23] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGTTTCATAAGTCTGTCTTAAAAGTGCGGGGCTTAACCCCGTGAGGGGATGGAAACTATGGAACTGGAGTATCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGCGCGAAAGCCAGGGG" 
## [24] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGACGGGTTTGCAAGTCAGATGTGAAATACCGCAGCTTAACTGCGGGGCTGCATTTGAAACTGCAAATCTTGAGTGCGGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACCGTAACTGACGTTGAGGCGCGAAAGCGTGGGT"
## [25] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGCCGGAGAGACAAGTCAGATGTGAAATCCGCGGGCTCAACCCGCGAACTGCATTTGAAACTGTTTCCCTTGAGTATCGGAGAGGTAACCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGGTTACTGGACGACAACTGACGGTGAGGCGCGAAAGCGTGGGG"
## [26] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGATCTGCAAGTCAGGCGTGAAATCCATGGGCTTAACCCATGAACTGCGCTTGAAACTGTGGGTCTTGAGTGAAGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGAAATGCGTAGAGATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCTTTAACTGACGCTGAGGCACGAAAGTGTGGGT"
## [27] "GCAAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGAAGAATAAGTCTGATGTGAAAGCCCTCGGCTTAACCGAGGAACTGCATCGGAAACTGTTTTTCTTGAGTGCAGAAGAGGAGAGTGGAACTCCATGTGTAGCGGTGGAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTCTCTGGTCTGCAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [28] "GCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGTACGTAGGCGGTTTTTTAAGCGAGGGGTATAAGGCAGCGGCTTAACTGCTGTTGGCCCCTCGAACTGGAGGACTTGAGTGTCGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGAGATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACGACAACTGACGCTGAGGTACGAAAGCGTGGGG" 
## [29] "GCGAGCGTTATCCGGAATTATTGGGTGTAAAGGGTGCGTAGGCGGGATGTAAAGTCAGATGTGAAATGCCGCGGCTCAACCGCGGAGCTGCATTTGAAACTTATGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAGGCGGCTTACTGGGCTTAGACTGACGCTGAGGCACGAAAGTGTGGGG"
## [30] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGCGGGCTTTTAAGTCAGCGGTCAAATGTCACGGCTCAACCGTGGCCAGCCGTTGAAACTGCAAGCCTTGAGTCTGCACAGGGCACATGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATCGCGAAGGCATTGTGCCGGGGCATAACTGACGCTGAGGCTCGAAAGTGCGGGT" 
## [31] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGCGCGTAGGCGGGACGTCAAGTCAGCGGTAAAAGACTGCAGCTAAACTGTAGCACGCCGTTGAAACTGGCGCCCTCGAGACGAGACGAGGGAGGCGGAACAAGTGAAGTAGCGGTGAAATGCATAGATATCACTTGGAACCCCGATAGCGAAGGCAGCTTCCCAGGCTCGATCTGACGCTGATGCGCGAGAGCGTGGGT" 
## [32] "GCGAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGATAGCAAGTCAGATGTGAAAACTATGGGCTCAACCTGTAGATTGCATTTGAAACTGTTGTTCTTGAGTGAAGTAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACATCGGTGGCGAAGGCGGCTTACTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTGGGG"
## [33] "GCGAGCGTTGTCCGGAATTATTGGGCGTAAAGGGCTTGTAGGTGGCTGGTTGCGTCTGTCGTGAAAGCTCATGGCTTAACTGTGGGTTTGCGGTGGGTACGGGCTGGCTTGAGTGCAGTAGGGGAGGCTGGAATTCCTGGTGTAGCGGTGGAATGCGCAGATATCAGGAGGAATACCGGTGGCGAAGGCGGGTCTCTGGGCTGTTACTGACACTGAGGAGCGAAAGCATGGGG"
## [34] "GCTAGCGTTATCCGGATTTACTGGGCGTAAAGGGTGCGTAGGTGGTTTCTTAAGTCAGGAGTGAAAGGCTACGGCTTAACCGTAGTAAGCTCTTGAAACTGGGAAACTTGAGTGCAGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTAGCGAAGGCGGCTTTCTGGACTGTAACTGACACTGAGGCACGAAAGCGTGGGG" 
## [35] "t__217134"                                                                                                                                                                                                                                
## [36] "t__262755"                                                                                                                                                                                                                                
## [37] "t__264494"                                                                                                                                                                                                                                
## [38] "t__266763"                                                                                                                                                                                                                                
## [39] "t__520"
# Same repeated-measures ANOVA, now on the CSS-normalized counts.
# BUG FIX: the original pulled timepoint/Host.Name metadata from the
# DESeq-normalized phyloseq object here; take them from the CSS object itself
# so abundances and metadata are guaranteed to describe the same samples in
# the same order.
asv_table <- t(otu_table(ps_CSS_norm_pass_min_postDD_sup003))
asv_table <- as.data.frame(asv_table)
asv_table$timepoint <- as.factor(ps_CSS_norm_pass_min_postDD_sup003@sam_data$Within.study.sampling.date)
asv_table$Host.Name <- ps_CSS_norm_pass_min_postDD_sup003@sam_data$Host.Name

# The last two columns are metadata (timepoint, Host.Name); the rest are ASVs.
n_asv <- ncol(asv_table) - 2

# One p-value per ASV, preallocated via vapply() instead of growing a table
# with rbind() inside a loop.
p_vals <- vapply(seq_len(n_asv), function(i) {
  form <- as.formula(paste0("asv_table[, ", i, "] ~ timepoint + Error(Host.Name/timepoint)"))
  fit_sum <- summary(aov(formula = form, data = asv_table))
  p <- fit_sum$`Error: Host.Name:timepoint`[[1]]$`Pr(>F)`[1]
  # Guard: a degenerate fit can lack the stratum, yielding NULL
  if (is.null(p)) NA_real_ else p
}, numeric(1))

anova_asv_res <- tibble(ASV = colnames(asv_table)[seq_len(n_asv)], p_val = p_vals)

# ASVs whose abundance differs significantly between timepoints
asv_sig_btwn_timep_CSS <- anova_asv_res$ASV[which(anova_asv_res$p_val <= 0.05)]

asv_sig_btwn_timep_CSS
##  [1] "GCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTCTGTCAAGTCGGATGTGAAATCCCCGGGCTCAACCTGGGAACTGCATTCGAAACTGGCAGGCTAGAGTCTTGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAATACCGGTGGCGAAGGCGGCCCCCTGGACAAAGACTGACGCTCAGGTGCGAAAGCGTGGGG"
##  [2] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGAAGGCTAAGTCTGATGTGAAAGCCCGGGGCTCAACCCCGGTACTGCATTGGAAACTGGTCATCTAGAGTGTCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [3] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGATGGCTAAGTCTGATGTGAAAGCCCGGGGCTCAACCCCGGGACTGCATTGGAAACTGGTTATCTTGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGACAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [4] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGCAGGCGGTTTGGCAAGTCTGATGTGAAAATCCGGGGCTCAACCCCGGAACTGCATTGGAAACTGTCAGACTAGAGTGTCGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGATAACTGACGCTGAGGCTCGAAAGCGTGGGG"
##  [5] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAAGAGCAAGTCTGATGTGAAAGGCTGGGGCTTAACCCCAGGACTGCATTGGAAACTGTTGTTCTAGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [6] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGATGCAAGTCTGGAGTGAAAGCCCAGGGCTCAACCCTGGGACTGCTTTGGAAACTGTATGGCTAGAGTGCTGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [7] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTTTGACAAGTCAGAAGTGAAAGCCCGTGGCTCAACTGCGGGACTGCTTTTGAAACTGTGAGACTGGATTGCAGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAAATGACGCTGAGGCTCGAAAGCGTGGGG"
##  [8] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAGTGCAAGTCAGATGTGAAAGGCCGGGGCTCAACCCCGGAGCTGCATTTGAAACTGCTCGGCTAGAGTACAGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACTGTTACTGACACTGAGGCACGAAAGCGTGGGG"
##  [9] "GCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCTCGGCTTAACCGAGGAAGCGCATCGGAAACTGGGAAACTTGAGTGCAGAAGAGGACAGTGGAACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTGTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [10] "GCAAGCGTTATCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTTTTAAGTCTGATGTGAAAGCCCTCGGCTTAACCGAGGAAGTGCATCGGAAACTGGGAAACTTGAGTGCAGAAGAGGACAGTGGAACTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAAGAACACCAGTGGCGAAGGCGGCTGTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [11] "GCAAGCGTTGTCCGGAATCATTGGGCGTAAAGAGTTCGTAGGCGGCATGTCAAGTCTGGTGTTAAATCCTGAGGCTCAACTTCAGTTCAGCACTGGATACTGGCAAGCTAGAATGCGGTAGAGGTAAAGGGAATTCCTGGTGTAGCGGTGAAATGCGTAGATATCAGGAAGAACACCGGTGGCGTAAGCGCTTTACTGGGCCGTTATTGACGCTGAGGAACGAAAGCCGGGGG"
## [12] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCCGTTTAAGTCAGATGTGAAATACCCGTGCTTAACATGGGGGCTGCATTTGAAACTGGATGGCTTGAGTGCAGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTGCTGGACTGTAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [13] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCGCGTTAAGTCAGATGTGAAATACCCGTGCTTAACATGGGGGGTGCATCTGATACTGGCGTGCTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [14] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGAGCAAGTTGGAAGTGAAATCTGTGGGCTCAACTCACAAATTGCTTTCAAAACTGTTTTTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [15] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGATCGTAAGTTGGGAGTGAAATTCATGGGCTCAACCCATGACCTGCTTTCAAAACTGCGATTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [16] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGTGATCAAGTCAGCTGTGAAAACTACGGGCTTAACCCGTAGACTGCAGTTGAAACTGTTCATCTTGAGTGAAGTAGAGGTTGGCGGAATTCCGAGTGTAGCGGTGAAATGCGTAGATATTCGGAGGAACACCGGTGGCGAAGGCGGCCAACTGGGCTTTAACTGACGCTGAGGCTCGAAAGTGTGGGG"
## [17] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCCGATCAAGTCAGGTGTGAAAGACCCGTGCTTAACATGGGGGTTGCACTTGAAACTGGTTGGCTTGAGTATGGGAGAGGCAAGCGGAATTCCCGGTGTAGCGGTGAAATGCGTAGAGATCGGGAGGAACACCAGTGGCGAAGGCGGCTTGCTGGACCAAAACTGACGCTGAGGCACGAAAGCGTGGGT"
## [18] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGAAGCAAGTCAGAAGTGAAATCCATGGGCTTAACCCATGAACTGCTTTTGAAACTGTTTCCCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTGGGG"
## [19] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTCAGATGTGAAATCCCCGGGCTTAACCCGGGAACTGCATTTGAAACTGTATCCCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTGGGG"
## [20] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTCAGATGTGAAATCCTCGGGCTTAACCCGGGAACTGCATTTGAAACTGTATCCCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTGGGG"
## [21] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGCTTTGCAAGTCAGATGTGAAATCTATGGGCTCAACCCATAAACTGCATTTGAAACTGTAGAGCTTGAGTGAAGTAGAGGCAGGCGGAATTCCCCGTGTAGCGGTGAAATGCGTAGAGATGGGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTAACTGACGCTGAGGCACGAAAGCGTGGGT"
## [22] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGTTCGGCAAGTCAGAAGTGAAATCCATGGGCTTAACCCATGAACTGCTTTTGAAACTGTCGAACTTGAGTGAAGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGAAATGCGTAGAGATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCTTTAACTGACGCTGAGGCACGAAAGCATGGGT"
## [23] "GCGAGCGTTATCCGGAATTATTGGGTGTAAAGCGTGTGTAGGCGGGAAATTAAGTCTAAGGTCTAAGCCCGGAGCTCAACTCCGGTTCGCCTTAGAAACTGATTTTCTTGAGTGTGGTAGAGGCAAACGGAATTTCTAGTGTAGCGGTAAAATGCGTAGATATTAGAAGGAACACCAGTGGCGAAGGCGGTTTGCTGGGCCACTACTGACGCTGAGACACGAAAGCGTGGGG" 
## [24] "t__262755"                                                                                                                                                                                                                                
## [25] "t__520"                                                                                                                                                                                                                                   
## [26] "t__81974"
# Overlap between the two normalizations: ASVs flagged as timepoint-variable
# by BOTH the DESeq- and the CSS-based repeated-measures ANOVAs (7 in common).
asv_sig_btwn_timep_DES[asv_sig_btwn_timep_DES %in% asv_sig_btwn_timep_CSS]
## [1] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGATGCAAGTCTGGAGTGAAAGCCCAGGGCTCAACCCTGGGACTGCTTTGGAAACTGTATGGCTAGAGTGCTGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACAGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
## [2] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCCGTTTAAGTCAGATGTGAAATACCCGTGCTTAACATGGGGGCTGCATTTGAAACTGGATGGCTTGAGTGCAGGAGAGGCAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGACTTGCTGGACTGTAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [3] "GCAAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGCGCGTTAAGTCAGATGTGAAATACCCGTGCTTAACATGGGGGGTGCATCTGATACTGGCGTGCTTGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGTAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [4] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAGAGCAAGTTGGAAGTGAAATCTGTGGGCTCAACTCACAAATTGCTTTCAAAACTGTTTTTCTTGAGTGGTGTAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACTAACTGACGCTGAGGCTCGAAAGCATGGGT"
## [5] "GCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGTGATCAAGTCAGCTGTGAAAACTACGGGCTTAACCCGTAGACTGCAGTTGAAACTGTTCATCTTGAGTGAAGTAGAGGTTGGCGGAATTCCGAGTGTAGCGGTGAAATGCGTAGATATTCGGAGGAACACCGGTGGCGAAGGCGGCCAACTGGGCTTTAACTGACGCTGAGGCTCGAAAGTGTGGGG"
## [6] "t__262755"                                                                                                                                                                                                                                
## [7] "t__520"
# Remove the timepoint-variable ASVs from each phyloseq object (treated as
# noise for the downstream differential-abundance analyses).
# prune_taxa(taxa, x) keeps only the taxa listed in `taxa`, so we keep the
# complement of the significant set.
drop_sig_taxa <- function(ps, sig_taxa) {
  keep <- setdiff(taxa_names(ps), sig_taxa)
  prune_taxa(keep, ps)
}

filtered_ps003 <- drop_sig_taxa(filtered_ps003, asv_sig_btwn_timep_DES)

ps_CSS_norm_pass_min_postDD_sup003 <- drop_sig_taxa(ps_CSS_norm_pass_min_postDD_sup003, asv_sig_btwn_timep_CSS)

ps_DeSeq_norm_pass_min_postDD_sup003 <- drop_sig_taxa(ps_DeSeq_norm_pass_min_postDD_sup003, asv_sig_btwn_timep_DES)

4 DA taxa identification

Identify bacterial and archaeal taxa (genera, species and strains) whose abundance is observed significantly more or less in the ASD

4.1 DESeq

dir.create(paste0(output_data, 'DESeq/'))

### Run the full DESeq2 pipeline (size factors + dispersion + Wald test).
### ps:   phyloseq object with a `phenotype` sample variable
### dcut: adjusted-p-value cutoff for the significant table
### Returns list(full results object, data.frame of significant taxa);
### when nothing is significant, a one-row placeholder keeps downstream
### code working on a data.frame.
runDESeq <- function(ps, dcut){
  dds <- phyloseq_to_deseq2(ps, ~ phenotype)
  # "poscounts" copes with the many zeros typical of amplicon count tables
  dds <- estimateSizeFactors(dds, type = "poscounts")
  dds <- DESeq(dds, fitType = "parametric", betaPrior = FALSE)
  # NOTE(review): contrast is N vs A here but A vs N in runDESeq_time, so
  # fold-change signs are flipped between the two — confirm this is intended.
  res <- results(dds, contrast = c("phenotype", "N", "A"))
  # Taxa dropped by independent filtering have padj == NA; treat as non-significant
  res$padj[is.na(res$padj)] <- 1
  sig <- res[res$padj < dcut, ]
  if (nrow(sig) == 0) {
    sigtab <- as.data.frame(1, row.names = "nothing")
    colnames(sigtab) <- 'padj'
  } else {
    sigtab <- data.frame(sig)
  }
  return(list(res, sigtab))
}


### Running analysis: split the data by the three real timepoints.
# Helper: subset a phyloseq object to the samples collected at one timepoint
# (replaces three copies of the same prune_samples() expression).
subset_timepoint <- function(ps, tp) {
  keep <- rownames(sample_data(ps))[sample_data(ps)$Within.study.sampling.date == tp]
  prune_samples(keep, ps)
}

P1 <- subset_timepoint(filtered_ps003, "Timepoint 1")
P2 <- subset_timepoint(filtered_ps003, "Timepoint 2")
P3 <- subset_timepoint(filtered_ps003, "Timepoint 3")

# several significants per timepoint
deseq_res_P1 <- runDESeq(P1, deseq_cut)
deseq_res_P2 <- runDESeq(P2, deseq_cut)
deseq_res_P3 <- runDESeq(P3, deseq_cut)

# print significant taxa
datatable(deseq_res_P1[[2]])
datatable(deseq_res_P2[[2]])
datatable(deseq_res_P3[[2]])
# "ASV_1669" present twice: timepoint 1 and 3

# save
# NOTE(review): files are written with saveRDS() but carry the .Rda extension
# (conventionally used for save()); left unchanged so existing readRDS() paths
# elsewhere keep working.
saveRDS(deseq_res_P1, file=paste0(output_data, "DESeq/deseq_res_P1_adjp", deseq_cut, ".Rda"))
saveRDS(deseq_res_P2, file=paste0(output_data, "DESeq/deseq_res_P2_adjp", deseq_cut, ".Rda"))
saveRDS(deseq_res_P3, file=paste0(output_data, "DESeq/deseq_res_P3_adjp", deseq_cut, ".Rda"))

# Working with the time series.
# Per the DESeq2 vignette: include the time factor in the design so the
# phenotype effect is tested while accounting for sampling date.
### ps:   phyloseq object; dcut: adjusted-p-value cutoff.
### Returns list(full results object, significant-only data.frame), with a
### one-row placeholder when nothing passes the cutoff.
runDESeq_time <- function(ps, dcut){
  dds <- phyloseq_to_deseq2(ps, ~ phenotype + Within.study.sampling.date)
  dds <- estimateSizeFactors(dds, type = "poscounts")
  dds <- DESeq(dds, fitType = "parametric", betaPrior = FALSE)
  # resultsNames(dds) lists the available coefficients/contrasts
  res <- results(dds, contrast = c("phenotype", "A", "N"))
  res$padj[is.na(res$padj)] <- 1
  sig <- res[res$padj < dcut, ]
  if (nrow(sig) == 0) {
    sigtab <- as.data.frame(1, row.names = "nothing")
    colnames(sigtab) <- 'padj'
  } else {
    sigtab <- data.frame(sig)
  }
  return(list(res, sigtab))
}

# Time-aware model over the whole longitudinal dataset
bla <- runDESeq_time(filtered_ps003, deseq_cut)
saveRDS(bla, file=paste0(output_data, "DESeq/Deseq_time_interaction_adjp", deseq_cut, ".Rda"))

# Interactive table of the significant hits from the time-aware model
# (bla[2][[1]] extracts the significant-only data.frame from the result list)
datatable(bla[2][[1]])
# significant ASVs
row.names(bla[2][[1]])
## [1] "ACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTTAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGT"
## [2] "t__262551"
# Bind the significant ASV names to their taxonomy (as a plain matrix)
des_res<-cbind(row.names(bla[2][[1]]), as(tax_table(filtered_ps003)[row.names(bla[2][[1]]), ], "matrix"))
des_res_esv <- des_res  # copy taken before the rownames are cleared below
rownames(des_res) <- NULL
des_res
##                                                                                                                                                                                                                                                 
## [1,] "ACAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGCAGGCGGGAAGACAAGTTGGAAGTGAAATCTATGGGCTTAACCCATAAACTGCTTTCAAAACTGTTTTTCTTGAGTAGTGCAGAGGTAGGCGGAATTCCCGGTGTAGCGGTGGAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCCTACTGGGCACCAACTGACGCTGAGGCTCGAAAGTGTGGGT"
## [2,] "t__262551"                                                                                                                                                                                                                                
##      Domain        Phylum            Class           Order                  
## [1,] "d__Bacteria" "p__Firmicutes_A" "c__Clostridia" "o__Oscillospirales"   
## [2,] "d__Bacteria" "p__Firmicutes"   "c__Bacilli"    "o__Erysipelotrichales"
##      Family                   Genus                 Species                    
## [1,] "f__Ruminococcaceae"     "g__Faecalibacterium" "s__unclassified"          
## [2,] "f__Erysipelotrichaceae" "g__Holdemanella"     "s__Holdemanella__biformis"
##      Strain           
## [1,] "t__unclassified"
## [2,] "t__262551"

4.2 metagenomeSeq

dir.create(paste0(output_data, 'metagenomseq/'))

### Fit metagenomeSeq's zero-inflated Gaussian (ZIG) model and keep the
### significant, well-supported features.
### ps:    phyloseq object with a `phenotype` sample variable
### maxit: maximum EM iterations for fitZig
### mcut:  adjusted-p-value cutoff
### Returns list(unfiltered MRtable, filtered MRtable with a Min_sample column).
run_metagenom_seq <- function(ps, maxit, mcut){
  p_metag <- phyloseq_to_metagenomeSeq(ps)
  # CSS normalization at the default 75th percentile
  p_metag <- cumNorm(p_metag, p = 0.75)
  normFactor <- normFactors(p_metag)
  normFactor <- log2(normFactor / median(normFactor) + 1)
  mod <- model.matrix(~phenotype + normFactor, data = pData(p_metag))
  settings <- zigControl(maxit = maxit, verbose = FALSE)
  fit <- fitZig(obj = p_metag, mod = mod, useCSSoffset = FALSE, control = settings)
  # Note: fit@taxa (slot access) rather than fit$taxa for newer metagenomeSeq versions
  res_fit <- MRtable(fit, number = length(fit@taxa))
  res_fit_nonfiltered <- copy(res_fit)
  res_fit <- res_fit[res_fit$adjPvalues < mcut, ]
  # Manually remove rows whose name contains "NA" BEFORE aligning the
  # effective-sample counts (invert = TRUE spelled out; no T/partial matching)
  res_fit <- res_fit[grep("NA", rownames(res_fit), invert = TRUE), ]
  # BUG FIX: the original subset Min_effec_samp with %in% before the NA-row
  # removal above, so its length/order could disagree with res_fit when it
  # was assigned as a column. Index by rownames(res_fit) to align by name.
  Min_effec_samp <- calculateEffectiveSamples(fit)
  Min_effec_samp <- Min_effec_samp[rownames(res_fit)]
  res_fit$Min_sample <- Min_effec_samp
  # Keep only features observed in at least the minimum effective sample
  # count in both phenotype groups
  res_fit <- res_fit[res_fit$`+samples in group 0` >= Min_effec_samp & res_fit$`+samples in group 1` >= Min_effec_samp, ]
  return(list(res_fit_nonfiltered, res_fit))
}

### Same ZIG fit as run_metagenom_seq, but with the sampling date included as
### a covariate in the model (for the all-timepoints longitudinal run).
### ps:    phyloseq object; maxit: maximum EM iterations; mcut: adj-p cutoff.
### Returns list(unfiltered MRtable, filtered MRtable with a Min_sample column).
run_metagenom_seq2 <- function(ps, maxit, mcut){
  p_metag <- phyloseq_to_metagenomeSeq(ps)
  # CSS normalization at the default 75th percentile
  p_metag <- cumNorm(p_metag, p = 0.75)
  normFactor <- normFactors(p_metag)
  normFactor <- log2(normFactor / median(normFactor) + 1)
  # Time factor added as a covariate alongside phenotype
  mod <- model.matrix(~phenotype + Within.study.sampling.date + normFactor, data = pData(p_metag))
  settings <- zigControl(maxit = maxit, verbose = FALSE)
  fit <- fitZig(obj = p_metag, mod = mod, useCSSoffset = FALSE, control = settings)
  # Note: fit@taxa (slot access) rather than fit$taxa for newer metagenomeSeq versions
  res_fit <- MRtable(fit, number = length(fit@taxa))
  res_fit_nonfiltered <- copy(res_fit)
  res_fit <- res_fit[res_fit$adjPvalues < mcut, ]
  # Manually remove rows whose name contains "NA" BEFORE aligning the
  # effective-sample counts
  res_fit <- res_fit[grep("NA", rownames(res_fit), invert = TRUE), ]
  # BUG FIX: the original subset Min_effec_samp with %in% before the NA-row
  # removal above, so its length/order could disagree with res_fit when it
  # was assigned as a column. Index by rownames(res_fit) to align by name.
  Min_effec_samp <- calculateEffectiveSamples(fit)
  Min_effec_samp <- Min_effec_samp[rownames(res_fit)]
  res_fit$Min_sample <- Min_effec_samp
  # Keep only features observed in at least the minimum effective sample
  # count in both phenotype groups
  res_fit <- res_fit[res_fit$`+samples in group 0` >= Min_effec_samp & res_fit$`+samples in group 1` >= Min_effec_samp, ]
  return(list(res_fit_nonfiltered, res_fit))
}

# Now run the ZIG model for each timepoint, and once across all timepoints.
# Helper: subset a phyloseq object to the samples collected at one timepoint
# (replaces three copies of the same prune_samples() expression).
subset_timepoint <- function(ps, tp) {
  keep <- rownames(sample_data(ps))[sample_data(ps)$Within.study.sampling.date == tp]
  prune_samples(keep, ps)
}

P1 <- subset_timepoint(filtered_ps003, "Timepoint 1")
P2 <- subset_timepoint(filtered_ps003, "Timepoint 2")
P3 <- subset_timepoint(filtered_ps003, "Timepoint 3")

# 30 = maximum number of iterations for the expectation-maximization algorithm
zig_res_P1 <- run_metagenom_seq(P1, 30, mtgseq_cut)
zig_res_P2 <- run_metagenom_seq(P2, 30, mtgseq_cut)
zig_res_P3 <- run_metagenom_seq(P3, 30, mtgseq_cut)

zig_res_all <- run_metagenom_seq2(filtered_ps003, 30, mtgseq_cut)

# print significant taxa
datatable(zig_res_P1[[2]])
datatable(zig_res_P2[[2]])
datatable(zig_res_P3[[2]])
datatable(zig_res_all[[2]])

# Helper: attach taxonomy and an enrichment-direction label to a ZIG result
# (replaces four copies of the same cbind/ifelse code). A negative phenotypeN
# coefficient is labeled "Aut", non-negative "Control", as in the original.
annotate_zig <- function(zig_res, ps) {
  out <- data.frame(cbind(zig_res[[2]], tax_table(ps)[rownames(zig_res[[2]]), ]))
  out$enriched <- ifelse(out$phenotypeN < 0, "Aut", "Control")
  out
}

zig_res_P1_filtered <- annotate_zig(zig_res_P1, P1)
zig_res_P2_filtered <- annotate_zig(zig_res_P2, P2)
zig_res_P3_filtered <- annotate_zig(zig_res_P3, P3)
zig_res_all_filtered <- annotate_zig(zig_res_all, filtered_ps003)


saveRDS(zig_res_P1, file=paste0(output_data, "metagenomseq/zig_res_P1_adjp", mtgseq_cut, ".rds"))
saveRDS(zig_res_P2, file=paste0(output_data, "metagenomseq/zig_res_P2_adjp", mtgseq_cut, ".rds"))
saveRDS(zig_res_P3, file=paste0(output_data, "metagenomseq/zig_res_P3_adjp", mtgseq_cut, ".rds"))
# CONSISTENCY FIX: the all-timepoints result was computed and used below but
# never persisted, unlike the three per-timepoint results above
saveRDS(zig_res_all, file=paste0(output_data, "metagenomseq/zig_res_all_adjp", mtgseq_cut, ".rds"))

# do we have any ESV in common between timepoints?
#Reduce(intersect, list(rownames(zig_res_P1_filtered),rownames(zig_res_P2_filtered),rownames(zig_res_P3_filtered)))
#rownames(zig_res_P1[[2]])[which(rownames(zig_res_P1[[2]]) %in% c(rownames(zig_res_P2[[2]]), rownames(zig_res_P3[[2]])))]
#rownames(zig_res_P2[[2]])[which(rownames(zig_res_P2[[2]]) %in% rownames(zig_res_P3[[2]]))]

# Significant taxa from the all-timepoints model, bound to their taxonomy
metag_res <- cbind(rownames(zig_res_all[[2]]), as(tax_table(filtered_ps003)[rownames(zig_res_all[[2]]), ], "matrix"))
metag_res_esv <- metag_res  # copy taken before the rownames are cleared below
rownames(metag_res) <- NULL
metag_res
##                                                                                                                                                                                                                                                  
##  [1,] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGTGGTGATTTAAGTCAGCGGTGAAAGTTTGTGGCTCAACCATAAAATTGCCGTTGAAACTGGGTTACTTGAGTGTGTTTGAGGTAGGCGGAATGCGTGGTGTAGCGGTGAAATGCATAGATATCACGCAGAACTCCGATTGCGAAGGCAGCTTACTAAACCATAACTGACACTGAAGCACGAAAGCGTGGGG"
##  [2,] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGTGCGTAGGCGGATTGGCAAGTCAGTAGTGAAATCCATGGGCTTAACCCATGACGTGCTATTGAAACTGTTGATCTTGAGTGAAGTAGAGGTAAGCGGAATTCCCGGTGTAGCGGTGAAATGCGTAGAGATCGGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGGCTTTAACTGACGCTGAGGCACGAAAGCATGGGT"
##  [3,] "GCAAGCGTTGTCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGAGATGCAAGTCAGATGTGAAATCCTCGGGCTTAACCCGGGAACTGCATTTGAAACTGTATCCCTTGAGTATCGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACGACAACTGACGCTGAGGCGCGAAAGCGTGGGG"
##  [4,] "GCAAGCGTTATCCGGAATGACTGGGCGTAAAGGGTGCGTAGGTGGTTTGTCAAGTTGGCAGCGTAATTCCGTGGCTTAACCGCGGAACTACTGCCAAAACTGATAGGCTTGAGTGCGGCAGGGGTATGTGGAATTCCTAGTGTAGCGGTGGAATGCGTAGATATTAGGAGGAACACCGGTGGCGAAAGCGACATACTGGGCCGTAACTGACACTGAAGCACGAAAGCGTGGGG"
##  [5,] "t__186843"                                                                                                                                                                                                                                
##  [6,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCGAGGCAAGTCTGATGTGAAAACCCGGGGCTCAACCCCGTGACTGCATTGGAAACTGTTTTGCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGCAACTGACGTTGAGGCTCGAAAGCGTGGGG"
##  [7,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGTGTAGGTGGTATCACAAGTCAGAAGTGAAAGCCCGGGGCTCAACCCCGGGACTGCTTTTGAAACTGTGGAACTGGAGTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAACTGACACTGAGGCTCGAAAGCGTGGGG"
##  [8,] "GCGAGCGTTATCCGGATTCATTGGGCGTAAAGCGCGCGTAGGCGGCCCGGCAGGCCGGGGGTCGAAGCGGGGGGCTCAACCCCCCGAAGCCCCCGGAACCTCCGCGGCTTGGGTCCGGTAGGGGAGGGTGGAACACCCGGTGTAGCGGTGGAATGCGCAGATATCGGGTGGAACACCGGTGGCGAAGGCGGCCCTCTGGGCCGAGACCGACGCTGAGGCGCGAAAGCTGGGGG"
##  [9,] "TCAAGCGTTGTTCGGAATCACTGGGCGTAAAGCGTGCGTAGGCTGTTTCGTAAGTCGTGTGTGAAAGGCGCGGGCTCAACCCGCGGACGGCACATGATACTGCGAGACTAGAGTAATGGAGGGGGAACCGGAATTCTCGGTGTAGCAGTGAAATGCGTAGATATCGAGAGGAACACTCGTGGCGAAGGCGGGTTCCTGGACATTAACTGACGCTGAGGCACGAAGGCCAGGGG"
## [10,] "GCAAGCGTTGTCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGATTGGTCAGTCTGTCTTAAAAGTTCGGGGCTTAACCCCGTGATGGGATGGAAACTGCCAATCTAGAGTATCGGAGAGGAAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGACTTTCTGGACGAAAACTGACGCTGAGGCGCGAAAGCCAGGGG" 
## [11,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGAATGGCAAGTCTGATGTGAAAGGCCGGGGCTCAACCCCGGGACTGCATTGGAAACTGTCAATCTAGAGTACCGGAGGGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
## [12,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGACTGGCAAGTCTGATGTGAAAGGCGGGGGCTCAACCCCTGGACTGCATTGGAAACTGTTAGTCTTGAGTGCCGGAGAGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
## [13,] "CCGAGCGTTGTCCGGATTTATTGGGCGTAAAGCGAGCGCAGGCGGTTTGATAAGTCTGAAGTTAAAGGCTGTGGCTCAACCATAGTTCGCTTTGGAAACTGTCAAACTTGAGTGCAGAAGGGGAGAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGATATATGGAGGAACACCGGTGGCGAAAGCGGCTCTCTGGTCTGTAACTGACGCTGAGGCTCGAAAGCGTGGGG" 
## [14,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGCGTGTAGGCGGGAGTGCAAGTCAGATGTGAAAACTATGGGCTCAACCCATAGCCTGCATTTGAAACTGTACTTCTTGAGTGATGGAGAGGCAGGCGGAATTCCCTGTGTAGCGGTGAAATGCGTAGATATAGGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACATTAACTGACGCTGAGGCGCGAAAGCGTGGGG"
## [15,] "CCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGATTGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGAAACTGGCAGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTAGACTGCAACTGACACTGATGCTCGAAAGTGTGGGT"
## [16,] "t__258798"                                                                                                                                                                                                                                
## [17,] "GCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGGCGGTTTATTAAGTTAGTGGTTAAATATTTGAGCTAAACTCAATTGTGCCATTAATACTGGTAAACTGGAGTACAGACGAGGTAGGCGGAATAAGTTAAGTAGCGGTGAAATGCATAGATATAACTTAGAACTCCGATAGCGAAGGCAGCTTACCAGACTGTAACTGACGCTGATGCACGAGAGCGTGGGT" 
## [18,] "CCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGAGCGTAGGTGGACAGTTAAGTCAGTTGTGAAAGTTTGCGGCTCAACCGTAAAATTGCAGTTGATACTGGCTGTCTTGAGTACAGTAGAGGTGGGCGGAATTCGTGGTGTAGCGGTGAAATGCTTAGATATCACGAAGAACTCCGATTGCGAAGGCAGCTCACTGGACTGCAACTGACACTGATGCTCGAAAGTGTGGGT"
## [19,] "GCGAGCGTTAATCGGAATAACTGGGCGTAAAGGGCACGCAGGCGGTGACTTAAGTGAGGTGTGAAAGCCCCGGGCTTAACCTGGGAATTGCATTTCATACTGGGTCGCTAGAGTACTTTAGGGAGGGGTAGAATTCCACGTGTAGCGGTGAAATGCGTAGAGATGTGGAGGAATACCGAAGGCGAAGGCAGCCCCTTGGGAATGTACTGACGCTCATGTGCGAAAGCGTGGGG"
## [20,] "GCGAGCGTTATCCGGAATTACTGGGTGTAAAGGGTGCGTAGGCGGCACCGTAAGTCTGTTGTGAAAGGCGATGGCTTAACCATCGAAGTGCAATGGAAACTGCGGAGCTAGAGTGCAGGAGAGGAAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAAGAACACCAGTGGCGAAGGCGGCTTTCTGGACTGCAACTGACGCTGAGGCACGAAAGCGTGGGG"
## [21,] "GCGAGCGTTATCCGGAATTATTGGGCGTAAAGAGCGCGCAGGTGGTTGATTAAGTCTGATGTGAAAGCCCACGGCTTAACCGTGGAGGGTCATTGGAAACTGGTCAACTTGAGTGCAGAAGAGGGAAGTGGAATTCCATGTGTAGCGGTGAAATGCGTAGAGATATGGAGGAACACCAGTGGCGAAGGCGGCTTCCTGGTCTGTAACTGACACTGAGGCGCGAAAGCGTGGGG"
## [22,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGTGTGGCAAGTCTGATGTGAAAGGCATGGGCTCAACCTGTGGACTGCATTGGAAACTGTCATACTTGAGTGCCGGAGGGGTAAGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGGTAACTGACGTTGAGGCTCGAAAGCGTGGGG"
## [23,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGAGCGTAGGCGGTCTGACAAGTCAGAAGTGAAAGCCCGGGGCTCAACTCCGGGACTGCTTTTGAAACTGCCGGACTAGATTGCAGGAGAGGTAAGTGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACTGTAAATGACGCTGAGGCTCGAAAGCGTGGGG"
## [24,] "t__220541"                                                                                                                                                                                                                                
## [25,] "t__209626"                                                                                                                                                                                                                                
## [26,] "GCAAGCGTTATCCGGATTTACTGGGTGTAAAGGGTGCGTAGGTGGCAGTGCAAGTCAGATGTGAAAGGCCGGGGCTCAACCCCGGAGCTGCATTTGAAACTGCATAGCTAGAGTACAGGAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGAAATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGACTGTTACTGACACTGAGGCACGAAAGCGTGGGG"
## [27,] "GCGAGCGTTGTCCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGTCTGAAAAGTCGGATGTGAAATCCCCGTGCTTAACATGGGAGCTGCATTCGAAACTTTCGGACTTGAGTGTCGGAGAGGTAAGCGGAATTCCCGGTGTAGCGGTGAAATGCGTAGATATCGGGAGGAACACCAGTGGCGAAGGCGGCTTACTGGACGACAACTGACGCTGAGGCACGAAAGCGTGGGG"
##       Domain        Phylum                 Class                   
##  [1,] "d__Bacteria" "p__Bacteroidota"      "c__Bacteroidia"        
##  [2,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
##  [3,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
##  [4,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
##  [5,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
##  [6,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
##  [7,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
##  [8,] "d__Bacteria" "p__Actinobacteriota"  "c__Coriobacteriia"     
##  [9,] "d__Bacteria" "p__Verrucomicrobiota" "c__Verrucomicrobiae"   
## [10,] "d__Bacteria" "p__Firmicutes_C"      "c__Negativicutes"      
## [11,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [12,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [13,] "d__Bacteria" "p__Firmicutes"        "c__Bacilli"            
## [14,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [15,] "d__Bacteria" "p__Bacteroidota"      "c__Bacteroidia"        
## [16,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [17,] "d__Bacteria" "p__Bacteroidota"      "c__Bacteroidia"        
## [18,] "d__Bacteria" "p__Bacteroidota"      "c__Bacteroidia"        
## [19,] "d__Bacteria" "p__Proteobacteria"    "c__Gammaproteobacteria"
## [20,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [21,] "d__Bacteria" "p__Firmicutes"        "c__Bacilli"            
## [22,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [23,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [24,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [25,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [26,] "d__Bacteria" "p__Firmicutes_A"      "c__Clostridia"         
## [27,] "d__Bacteria" "p__Firmicutes_A"      "c__unclassified"       
##       Order                   Family                  
##  [1,] "o__Bacteroidales"      "f__Tannerellaceae"     
##  [2,] "o__Oscillospirales"    "f__Acutalibacteraceae" 
##  [3,] "o__Oscillospirales"    "f__Oscillospiraceae"   
##  [4,] "o__unclassified"       "f__unclassified"       
##  [5,] "o__Christensenellales" "f__Christensenellaceae"
##  [6,] "o__Lachnospirales"     "f__Lachnospiraceae"    
##  [7,] "o__Lachnospirales"     "f__Lachnospiraceae"    
##  [8,] "o__Coriobacteriales"   "f__Coriobacteriaceae"  
##  [9,] "o__Verrucomicrobiales" "f__Akkermansiaceae"    
## [10,] "o__Veillonellales"     "f__Veillonellaceae"    
## [11,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [12,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [13,] "o__Lactobacillales"    "f__Streptococcaceae"   
## [14,] "o__Oscillospirales"    "f__Oscillospiraceae"   
## [15,] "o__Bacteroidales"      "f__Bacteroidaceae"     
## [16,] "o__Oscillospirales"    "f__Acutalibacteraceae" 
## [17,] "o__Bacteroidales"      "f__Marinifilaceae"     
## [18,] "o__Bacteroidales"      "f__Bacteroidaceae"     
## [19,] "o__Enterobacterales"   "f__Pasteurellaceae"    
## [20,] "o__unclassified"       "f__unclassified"       
## [21,] "o__Haloplasmatales"    "f__Turicibacteraceae"  
## [22,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [23,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [24,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [25,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [26,] "o__Lachnospirales"     "f__Lachnospiraceae"    
## [27,] "o__unclassified"       "f__unclassified"       
##       Genus                      Species                               
##  [1,] "g__Parabacteroides"       "s__Parabacteroides__merdae"          
##  [2,] "g__unclassified"          "s__unclassified"                     
##  [3,] "g__unclassified"          "s__unclassified"                     
##  [4,] "g__unclassified"          "s__unclassified"                     
##  [5,] "g__PROV_t__186843"        "s__PROV_t__186843"                   
##  [6,] "g__unclassified"          "s__unclassified"                     
##  [7,] "g__unclassified"          "s__unclassified"                     
##  [8,] "g__Collinsella"           "s__unclassified"                     
##  [9,] "g__Akkermansia"           "s__Akkermansia__muciniphila"         
## [10,] "g__Veillonella"           "s__unclassified"                     
## [11,] "g__Ruminococcus_A"        "s__unclassified"                     
## [12,] "g__Blautia_A"             "s__unclassified"                     
## [13,] "g__Streptococcus"         "s__unclassified"                     
## [14,] "g__Intestinimonas"        "s__Intestinimonas__butyriciproducens"
## [15,] "g__Bacteroides"           "s__unclassified"                     
## [16,] "g__Anaeromassilibacillus" "s__PROV_t__258798"                   
## [17,] "g__Odoribacter"           "s__Odoribacter__splanchnicus"        
## [18,] "g__Bacteroides"           "s__Bacteroides__thetaiotaomicron"    
## [19,] "g__Haemophilus_D"         "s__unclassified"                     
## [20,] "g__unclassified"          "s__unclassified"                     
## [21,] "g__Turicibacter"          "s__unclassified"                     
## [22,] "g__Blautia_A"             "s__Blautia_A__wexlerae"              
## [23,] "g__PROV_t__182732"        "s__unclassified"                     
## [24,] "g__Blautia_A"             "s__PROV_t__220541"                   
## [25,] "g__PROV_t__209626"        "s__PROV_t__209626"                   
## [26,] "g__Eubacterium_E"         "s__Eubacterium_E__hallii_A"          
## [27,] "g__unclassified"          "s__unclassified"                     
##       Strain           
##  [1,] "t__unclassified"
##  [2,] "t__unclassified"
##  [3,] "t__unclassified"
##  [4,] "t__unclassified"
##  [5,] "t__186843"      
##  [6,] "t__unclassified"
##  [7,] "t__unclassified"
##  [8,] "t__unclassified"
##  [9,] "t__unclassified"
## [10,] "t__unclassified"
## [11,] "t__unclassified"
## [12,] "t__unclassified"
## [13,] "t__unclassified"
## [14,] "t__unclassified"
## [15,] "t__unclassified"
## [16,] "t__258798"      
## [17,] "t__unclassified"
## [18,] "t__unclassified"
## [19,] "t__unclassified"
## [20,] "t__unclassified"
## [21,] "t__unclassified"
## [22,] "t__unclassified"
## [23,] "t__unclassified"
## [24,] "t__220541"      
## [25,] "t__209626"      
## [26,] "t__unclassified"
## [27,] "t__unclassified"

4.3 ANCOM

#ANCOM, old version (the 2.1 workflow follows below)
#retrieve taxa
#format metadata
#metada_ps<-sample_data(filtered_ps003)
#metada_ps<-as.data.frame(metada_ps)
#res_table_filt<-otu_table(filtered_ps003)
#res_table_filt<-as.data.frame(res_table_filt)
#res_table_filt<-t(res_table_filt)
#res_table_filt<-as.data.frame(res_table_filt)

#res_table_filt$metada<-metada_ps$phenotype[rownames(metada_ps)%in%rownames(res_table_filt)]
#ancom.all.filt.0.05 <- ANCOM(res_table_filt, sig = 0.05, multcorr = 2)
#saveRDS(ancom.all.filt.0.05, file=paste0(output_data, "ANCOM/ancom_res", mtgseq_cut, ".rds"))

#Format the Detected Table w/ Taxa 
#main.05 <- cbind(ancom.all.filt.0.05$detected, as(tax_table(filtered_ps003)[ancom.all.filt.0.05$detected, ], "matrix"))

#main.05esv <- main.05
#row.names(main.05) <- NULL
#main.05



#New ANCOM 2.1

# Build the metadata table ANCOM 2.1 expects: one row per sample carrying the
# host identifier (grouping unit for the random effect), the phenotype, and
# the sample barcode used to match the feature table.
metada_ps<-sample_data(filtered_ps003)
metada_ps<-as.data.frame(metada_ps)
metada_ps$HostName <- as.character(metada_ps$Host.Name)
metada_ps$phenotype <- as.character(metada_ps$phenotype)

metada_ps<-tibble(metada_ps$HostName, metada_ps$phenotype, rows = rownames(metada_ps))
colnames(metada_ps) <- c("HostName", "phenotype", "Biospecimen.Barcode")
res_table_filt<-otu_table(filtered_ps003)
res_table_filt<-as.data.frame(res_table_filt)

# ANCOM 2.1 preprocessing: flags outliers (out_cut), drops taxa present in
# fewer than 10% of samples (zero_cut = 0.90) and samples under 1000 reads.
prepro<-feature_table_pre_process(feature_table = res_table_filt, meta_data = metada_ps, sample_var = "Biospecimen.Barcode", group_var = "phenotype", out_cut = 0.05, zero_cut = 0.90, lib_cut = 1000, neg_lb = FALSE)

feature_table = prepro$feature_table # Preprocessed feature table
meta_data = prepro$meta_data # Preprocessed metadata
struc_zero = prepro$structure_zeros # Structural zero info

# All timepoints pooled -> repeated measures per host, hence the random
# intercept formula "~ 1 | HostName".  'control' is defined but deliberately
# not passed (kept commented in the ANCOM call).
main_var = "phenotype"; p_adj_method = "BH"; alpha = 0.05
adj_formula = NULL; rand_formula = "~ 1 | HostName" ; control = list(msMaxIter = 50)
ancom.all.filt.0.05 <- ANCOM(feature_table, meta_data, struc_zero, main_var, p_adj_method, 
            alpha, adj_formula, rand_formula)#, control)
# BUGFIX: the results are written under output_data, so the ANCOM directory
# must be created there.  The previous dir.create(path = "ANCOM") created a
# folder in the working directory, not where saveRDS writes, so the save
# failed on a fresh setup.  Idempotent via showWarnings = FALSE.
dir.create(paste0(output_data, "ANCOM"), showWarnings = FALSE, recursive = TRUE)
saveRDS(ancom.all.filt.0.05, file=paste0(output_data, "ANCOM/ancom_res", mtgseq_cut, ".rds"))

#Format the Detected Table w/ Taxa 
ancom.all.filt.0.05$detected
## NULL
#no significant taxa from ANCOM



#Trying by each timepoint

#Timepoint 3
# Subset to Timepoint 3 only; one sample per host here, so no random effect
# is needed (rand_formula = NULL).
ps3<-subset_samples(filtered_ps003, Within.study.sampling.date == "Timepoint 3" )
metada_ps<-sample_data(ps3)
metada_ps<-as.data.frame(metada_ps)
metada_ps$HostName <- as.character(metada_ps$Host.Name)
metada_ps$phenotype <- as.character(metada_ps$phenotype)

# Plain metadata table (host, phenotype, barcode) as ANCOM 2.1 expects
metada_ps<-tibble(metada_ps$HostName, metada_ps$phenotype, rows = rownames(metada_ps))
colnames(metada_ps) <- c("HostName", "phenotype", "Biospecimen.Barcode")
res_table_filt<-otu_table(ps3)
res_table_filt<-as.data.frame(res_table_filt)

prepro<-feature_table_pre_process(feature_table = res_table_filt, meta_data = metada_ps, sample_var = "Biospecimen.Barcode", group_var = "phenotype", out_cut = 0.05, zero_cut = 0.90, lib_cut = 1000, neg_lb = FALSE)

feature_table = prepro$feature_table # Preprocessed feature table
meta_data = prepro$meta_data # Preprocessed metadata
struc_zero = prepro$structure_zeros # Structural zero info

main_var = "phenotype"; p_adj_method = "BH"; alpha = 0.05
adj_formula = NULL; rand_formula = NULL
ancom.all.filt.0.05.3 <- ANCOM(feature_table, meta_data, struc_zero, main_var, p_adj_method, 
            alpha, adj_formula, rand_formula)#, control)
# Ensure the ANCOM output folder exists under output_data before writing
# (idempotent; saveRDS does not create directories).
dir.create(paste0(output_data, "ANCOM"), showWarnings = FALSE, recursive = TRUE)
saveRDS(ancom.all.filt.0.05.3, file=paste0(output_data, "ANCOM/ancom_res3_", mtgseq_cut, ".rds"))

#Format the Detected Table w/ Taxa 
ancom.all.filt.0.05.3$detected
## NULL
# No sig


#Timepoint 2
# Same single-timepoint ANCOM run as above, restricted to Timepoint 2
# samples; one sample per host, so rand_formula = NULL.
ps2<-subset_samples(filtered_ps003, Within.study.sampling.date == "Timepoint 2" )
metada_ps<-sample_data(ps2)
metada_ps<-as.data.frame(metada_ps)
metada_ps$HostName <- as.character(metada_ps$Host.Name)
metada_ps$phenotype <- as.character(metada_ps$phenotype)

metada_ps<-tibble(metada_ps$HostName, metada_ps$phenotype, rows = rownames(metada_ps))
colnames(metada_ps) <- c("HostName", "phenotype", "Biospecimen.Barcode")
res_table_filt<-otu_table(ps2)
res_table_filt<-as.data.frame(res_table_filt)

prepro<-feature_table_pre_process(feature_table = res_table_filt, meta_data = metada_ps, sample_var = "Biospecimen.Barcode", group_var = "phenotype", out_cut = 0.05, zero_cut = 0.90, lib_cut = 1000, neg_lb = FALSE)

feature_table = prepro$feature_table # Preprocessed feature table
meta_data = prepro$meta_data # Preprocessed metadata
struc_zero = prepro$structure_zeros # Structural zero info

main_var = "phenotype"; p_adj_method = "BH"; alpha = 0.05
adj_formula = NULL; rand_formula = NULL
ancom.all.filt.0.05.2 <- ANCOM(feature_table, meta_data, struc_zero, main_var, p_adj_method, 
            alpha, adj_formula, rand_formula)#, control)
# Ensure the ANCOM output folder exists under output_data before writing
# (idempotent; saveRDS does not create directories).
dir.create(paste0(output_data, "ANCOM"), showWarnings = FALSE, recursive = TRUE)
saveRDS(ancom.all.filt.0.05.2, file=paste0(output_data, "ANCOM/ancom_res2_", mtgseq_cut, ".rds"))

#Format the Detected Table w/ Taxa 
ancom.all.filt.0.05.2$detected
## NULL
# No sig



#Timepoint 1
# Same single-timepoint ANCOM run, restricted to Timepoint 1 samples;
# one sample per host, so rand_formula = NULL.
ps1<-subset_samples(filtered_ps003, Within.study.sampling.date == "Timepoint 1" )
metada_ps<-sample_data(ps1)
metada_ps<-as.data.frame(metada_ps)
metada_ps$HostName <- as.character(metada_ps$Host.Name)
metada_ps$phenotype <- as.character(metada_ps$phenotype)

metada_ps<-tibble(metada_ps$HostName, metada_ps$phenotype, rows = rownames(metada_ps))
colnames(metada_ps) <- c("HostName", "phenotype", "Biospecimen.Barcode")
res_table_filt<-otu_table(ps1)
res_table_filt<-as.data.frame(res_table_filt)

prepro<-feature_table_pre_process(feature_table = res_table_filt, meta_data = metada_ps, sample_var = "Biospecimen.Barcode", group_var = "phenotype", out_cut = 0.05, zero_cut = 0.90, lib_cut = 1000, neg_lb = FALSE)

feature_table = prepro$feature_table # Preprocessed feature table
meta_data = prepro$meta_data # Preprocessed metadata
struc_zero = prepro$structure_zeros # Structural zero info

main_var = "phenotype"; p_adj_method = "BH"; alpha = 0.05
adj_formula = NULL; rand_formula = NULL
ancom.all.filt.0.05.1 <- ANCOM(feature_table, meta_data, struc_zero, main_var, p_adj_method, 
            alpha, adj_formula, rand_formula)#, control)
# Ensure the ANCOM output folder exists under output_data before writing
# (idempotent; saveRDS does not create directories).
dir.create(paste0(output_data, "ANCOM"), showWarnings = FALSE, recursive = TRUE)
saveRDS(ancom.all.filt.0.05.1, file=paste0(output_data, "ANCOM/ancom_res1_", mtgseq_cut, ".rds"))

#Format the Detected Table w/ Taxa 
ancom.all.filt.0.05.1$detected
## NULL
# No sig

4.4 Results table summarizing significant taxa

#Deseq results

#Assemble Deseq results into one table: one row per significant ASV, labelled
#with the analysis it came from (per-timepoint or all-timepoints DESeq2 run).
DesP1<-tibble(rownames(deseq_res_P1[[2]]), rep("DESeq2_P1", length(rownames(deseq_res_P1[[2]]))))
colnames(DesP1) <- c("ASV", "Method+Data")

DesP2<-tibble(rownames(deseq_res_P2[[2]]), rep("DESeq2_P2", length(rownames(deseq_res_P2[[2]]))))
colnames(DesP2) <- c("ASV", "Method+Data")

DesP3<-tibble(rownames(deseq_res_P3[[2]]), rep("DESeq2_P3", length(rownames(deseq_res_P3[[2]]))))
colnames(DesP3) <- c("ASV", "Method+Data")

DesAll<-tibble(des_res[,1], rep("DESeq2_All", length(des_res[,1])))
colnames(DesAll) <- c("ASV", "Method+Data")

deseq_allsig<-rbind(DesP1,DesP2, DesP3, DesAll)

#Find ones found multiple times (either in individual timepoints or all together) and add them back in with correct labeling
# Note: ddply/transform renames the "Method+Data" column to "Method.Data".
deseqcount<-plyr::ddply(deseq_allsig, "ASV", transform, count = length(ASV))
duplic<-tibble(deseqcount$ASV[which(deseqcount$count == 2)], deseqcount$Method.Data[which(deseqcount$count == 2)])
# NOTE(review): the hard-coded indices [1]/[3] and the literal labels
# "DESeq2_P3+All" / "DESeq2_P1+P3" assume exactly two ASVs are duplicated and
# that they appear in this specific order for this dataset — this will
# silently mislabel results if the input data change; verify against duplic.
a<-tibble(duplic$`deseqcount$ASV[which(deseqcount$count == 2)]`[1], "DESeq2_P3+All")
colnames(a) <- c("ASV", "Method+Data")

b<-tibble(duplic$`deseqcount$ASV[which(deseqcount$count == 2)]`[3], "DESeq2_P1+P3")
colnames(b) <- c("ASV", "Method+Data")

to_add_backin<-rbind(a,b)

# Drop the duplicated single-label rows, then append the combined-label rows.
# NOTE(review): -which(...) removes ALL rows if there are no matches; safe
# only under the same two-duplicates assumption as above.
deseq_allsig<-deseq_allsig[-which(deseq_allsig$ASV == deseqcount$ASV[which(deseqcount$count == 2)][1]),]
deseq_allsig<-deseq_allsig[-which(deseq_allsig$ASV == deseqcount$ASV[which(deseqcount$count == 2)][3]),]

deseq_allsig<-rbind(deseq_allsig, to_add_backin)
deseq_allsig<-cbind(deseq_allsig, as(tax_table(filtered_ps003)[deseq_allsig$ASV, ], "matrix"))
# NOTE(review): relative path — assumes a "DESeq" folder already exists in
# the working directory; confirm, or create it beforehand.
saveRDS(deseq_allsig, "DESeq/full_res_table_deseq")

# Display copy without the raw ASV sequences
deseq_allsig.print <- deseq_allsig
deseq_allsig.print$ASV <- NULL
rownames(deseq_allsig.print) <- NULL
deseq_allsig.print
##      Method+Data      Domain           Phylum              Class
## 1      DESeq2_P1 d__Bacteria  p__Firmicutes_A      c__Clostridia
## 2      DESeq2_P1 d__Bacteria  p__Firmicutes_A      c__Clostridia
## 3      DESeq2_P1 d__Bacteria  p__Firmicutes_C   c__Negativicutes
## 4      DESeq2_P2 d__Bacteria  p__unclassified    c__unclassified
## 5      DESeq2_P2  d__Archaea p__Euryarchaeota c__Methanobacteria
## 6      DESeq2_P3 d__Bacteria  p__unclassified    c__unclassified
## 7      DESeq2_P3 d__Bacteria  p__Bacteroidota     c__Bacteroidia
## 8      DESeq2_P3 d__Bacteria    p__Firmicutes         c__Bacilli
## 9      DESeq2_P3 d__Bacteria  p__Firmicutes_A      c__Clostridia
## 10    DESeq2_All d__Bacteria    p__Firmicutes         c__Bacilli
## 11 DESeq2_P3+All d__Bacteria  p__Firmicutes_A      c__Clostridia
## 12  DESeq2_P1+P3 d__Bacteria  p__Bacteroidota     c__Bacteroidia
##                    Order                 Family                   Genus
## 1     o__Oscillospirales     f__Ruminococcaceae     g__Faecalibacterium
## 2        o__unclassified        f__unclassified         g__unclassified
## 3      o__Veillonellales      f__Dialisteraceae            g__Dialister
## 4        o__unclassified        f__unclassified         g__unclassified
## 5  o__Methanobacteriales f__Methanobacteriaceae g__Methanobrevibacter_A
## 6        o__unclassified        f__unclassified         g__unclassified
## 7       o__Bacteroidales      f__Bacteroidaceae          g__Bacteroides
## 8     o__Lactobacillales    f__Streptococcaceae          g__Lactococcus
## 9      o__Lachnospirales     f__Lachnospiraceae            g__Blautia_A
## 10 o__Erysipelotrichales f__Erysipelotrichaceae         g__Holdemanella
## 11    o__Oscillospirales     f__Ruminococcaceae     g__Faecalibacterium
## 12      o__Bacteroidales     f__Barnesiellaceae          g__Barnesiella
##                             Species          Strain
## 1                   s__unclassified t__unclassified
## 2                   s__unclassified t__unclassified
## 3                   s__PROV_t__4989         t__4989
## 4                   s__unclassified t__unclassified
## 5                   s__unclassified t__unclassified
## 6                   s__unclassified t__unclassified
## 7      s__Bacteroides__intestinalis t__unclassified
## 8                   s__unclassified t__unclassified
## 9                   s__unclassified t__unclassified
## 10        s__Holdemanella__biformis       t__262551
## 11                  s__unclassified t__unclassified
## 12 s__Barnesiella__intestinihominis        t__21316
# metagenomeSeq (MTG) significance results: one table per dataset/run

# Significant ASVs from each metagenomeSeq (ZIG) run, tagged with the
# method + dataset label they came from.
tag_asvs <- function(asvs, label) {
  # asvs : character vector of ASV identifiers
  # label: run label to attach to every ASV (e.g. "Mtg_P1")
  res <- tibble(asvs, rep(label, length(asvs)))
  colnames(res) <- c("ASV", "Method+Data")
  res
}

MtgP1  <- tag_asvs(rownames(zig_res_P1[[2]]), "Mtg_P1")
MtgP2  <- tag_asvs(rownames(zig_res_P2[[2]]), "Mtg_P2")
MtgP3  <- tag_asvs(rownames(zig_res_P3[[2]]), "Mtg_P3")
MtgAll <- tag_asvs(metag_res[, 1], "Mtg_All")

# One long table of all significant hits across the four runs
Mtg_allsig <- rbind(MtgP1, MtgP2, MtgP3, MtgAll)

# Collapse ASVs called significant by more than one metagenomeSeq run into a
# single row whose "Method+Data" label is the concatenation of the individual
# run labels (e.g. "Mtg_P1Mtg_All"), then attach the full taxonomy and save.
#
# NOTE(review): the previous version had two edge-case bugs, both fixed here:
#   (1) `for (i in 2:length(unique(...)))` counts backwards (2:1) when fewer
#       than two distinct ASVs are duplicated;
#   (2) `Mtg_allsig[-which(...), ]` drops ALL rows when `which()` returns
#       integer(0), i.e. when there are no duplicated ASVs at all.

# Distinct ASVs that appear in >= 2 runs, sorted to match the group ordering
# the original plyr::ddply grouping produced.
dup_asvs <- sort(unique(Mtg_allsig$ASV[duplicated(Mtg_allsig$ASV)]))

if (length(dup_asvs) > 0) {
  # For each duplicated ASV, paste its run labels together in the order the
  # rows appear in Mtg_allsig (P1, P2, P3, All — same within-group order as
  # the original ddply output).
  collapsed <- vapply(
    dup_asvs,
    function(a) paste(Mtg_allsig$`Method+Data`[Mtg_allsig$ASV == a], collapse = ""),
    character(1)
  )
  replacetab <- tibble(dup_asvs, unname(collapsed))
  colnames(replacetab) <- c("ASV", "Method+Data")

  # Replace the per-run rows of duplicated ASVs with the collapsed rows
  Mtg_allsig <- Mtg_allsig[!(Mtg_allsig$ASV %in% replacetab$ASV), ]
  Mtg_allsig <- rbind(Mtg_allsig, replacetab)
}

# Attach the full taxonomy for every remaining significant ASV
Mtg_allsig <- cbind(Mtg_allsig, as(tax_table(filtered_ps003)[Mtg_allsig$ASV, ], "matrix"))
saveRDS(Mtg_allsig, "metagenomseq//full_res_table_mtg")

# Print-friendly copy: drop the raw ASV identifier column and reset the row
# names so the table renders with a compact 1..n index.
Mtg_allsig.print <- Mtg_allsig[, colnames(Mtg_allsig) != "ASV"]
rownames(Mtg_allsig.print) <- NULL
Mtg_allsig.print
##            Method+Data      Domain               Phylum                  Class
## 1               Mtg_P1 d__Bacteria  p__Actinobacteriota      c__Actinobacteria
## 2               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 3               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 4               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 5               Mtg_P1 d__Bacteria        p__Firmicutes             c__Bacilli
## 6               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 7               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 8               Mtg_P1 d__Bacteria        p__Firmicutes             c__Bacilli
## 9               Mtg_P1 d__Bacteria        p__Firmicutes             c__Bacilli
## 10              Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 11              Mtg_P2 d__Bacteria        p__Firmicutes             c__Bacilli
## 12              Mtg_P2 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 13              Mtg_P2 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 14              Mtg_P2 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 15              Mtg_P2 d__Bacteria        p__Firmicutes             c__Bacilli
## 16              Mtg_P2 d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 17              Mtg_P2 d__Bacteria      p__unclassified        c__unclassified
## 18              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 19              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 20              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 21              Mtg_P3 d__Bacteria        p__Firmicutes             c__Bacilli
## 22              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 23              Mtg_P3 d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 24              Mtg_P3 d__Bacteria        p__Firmicutes             c__Bacilli
## 25              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 26              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 27              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 28              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 29             Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 30             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 31             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 32             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 33             Mtg_All d__Bacteria p__Verrucomicrobiota    c__Verrucomicrobiae
## 34             Mtg_All d__Bacteria        p__Firmicutes             c__Bacilli
## 35             Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 36             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 37             Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 38             Mtg_All d__Bacteria    p__Proteobacteria c__Gammaproteobacteria
## 39             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 40             Mtg_All d__Bacteria        p__Firmicutes             c__Bacilli
## 41             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 42             Mtg_All d__Bacteria      p__Firmicutes_A        c__unclassified
## 43        Mtg_P2Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 44       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 45       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 46       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 47       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 48       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 49       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 50       Mtg_P2Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 51        Mtg_P1Mtg_P3 d__Bacteria      p__Firmicutes_C       c__Negativicutes
## 52       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_C       c__Negativicutes
## 53       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 54  Mtg_P1Mtg_P2Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 55       Mtg_P2Mtg_All d__Bacteria  p__Actinobacteriota      c__Coriobacteriia
## 56       Mtg_P3Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 57 Mtg_P2Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 58       Mtg_P2Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
##                      Order                       Family
## 1       o__Actinomycetales        f__Bifidobacteriaceae
## 2        o__Lachnospirales           f__Lachnospiraceae
## 3        o__Lachnospirales           f__Lachnospiraceae
## 4        o__Lachnospirales           f__Lachnospiraceae
## 5       o__Lactobacillales          f__Streptococcaceae
## 6       o__Oscillospirales          f__Oscillospiraceae
## 7        o__Lachnospirales           f__Lachnospiraceae
## 8    o__Erysipelotrichales       f__Erysipelotrichaceae
## 9      o__Staphylococcales               f__Gemellaceae
## 10        o__Eubacteriales           f__Anaerofustaceae
## 11   o__Erysipelotrichales       f__Erysipelotrichaceae
## 12       o__Lachnospirales           f__Lachnospiraceae
## 13      o__Oscillospirales        f__Acutalibacteraceae
## 14       o__Lachnospirales           f__Lachnospiraceae
## 15   o__Erysipelotrichales       f__Erysipelotrichaceae
## 16        o__Bacteroidales            f__Bacteroidaceae
## 17         o__unclassified              f__unclassified
## 18      o__Oscillospirales           f__Ruminococcaceae
## 19       o__Lachnospirales           f__Lachnospiraceae
## 20       o__Lachnospirales           f__Lachnospiraceae
## 21      o__Lactobacillales          f__Streptococcaceae
## 22       o__Lachnospirales           f__Lachnospiraceae
## 23        o__Bacteroidales            f__Bacteroidaceae
## 24   o__Erysipelotrichales f__Erysipelatoclostridiaceae
## 25       o__Lachnospirales           f__Anaerotignaceae
## 26       o__Lachnospirales           f__Lachnospiraceae
## 27       o__Lachnospirales           f__Lachnospiraceae
## 28      o__Oscillospirales           f__Ruminococcaceae
## 29        o__Bacteroidales            f__Tannerellaceae
## 30      o__Oscillospirales        f__Acutalibacteraceae
## 31         o__unclassified              f__unclassified
## 32       o__Lachnospirales           f__Lachnospiraceae
## 33   o__Verrucomicrobiales           f__Akkermansiaceae
## 34      o__Lactobacillales          f__Streptococcaceae
## 35        o__Bacteroidales            f__Bacteroidaceae
## 36      o__Oscillospirales        f__Acutalibacteraceae
## 37        o__Bacteroidales            f__Bacteroidaceae
## 38     o__Enterobacterales           f__Pasteurellaceae
## 39         o__unclassified              f__unclassified
## 40      o__Haloplasmatales         f__Turicibacteraceae
## 41       o__Lachnospirales           f__Lachnospiraceae
## 42         o__unclassified              f__unclassified
## 43        o__Eubacteriales            f__Eubacteriaceae
## 44       o__Lachnospirales           f__Lachnospiraceae
## 45       o__Lachnospirales           f__Lachnospiraceae
## 46       o__Lachnospirales           f__Lachnospiraceae
## 47       o__Lachnospirales           f__Lachnospiraceae
## 48       o__Lachnospirales           f__Lachnospiraceae
## 49      o__Oscillospirales          f__Oscillospiraceae
## 50       o__Lachnospirales           f__Lachnospiraceae
## 51       o__Veillonellales           f__Veillonellaceae
## 52       o__Veillonellales           f__Veillonellaceae
## 53      o__Oscillospirales          f__Oscillospiraceae
## 54 o__Peptostreptococcales          f__Anaerovoracaceae
## 55     o__Coriobacteriales         f__Coriobacteriaceae
## 56        o__Bacteroidales            f__Marinifilaceae
## 57   o__Christensenellales       f__Christensenellaceae
## 58       o__Lachnospirales           f__Lachnospiraceae
##                        Genus                              Species
## 1         g__Bifidobacterium          s__Bifidobacterium__bifidum
## 2            g__unclassified                      s__unclassified
## 3            g__Anaerostipes              s__Anaerostipes__hadrus
## 4               g__Blautia_A                      s__unclassified
## 5           g__Streptococcus                      s__unclassified
## 6           g__Lawsonibacter                      s__unclassified
## 7           g__Clostridium_M      s__Clostridium_M__asparagiforme
## 8           g__Solobacterium             s__Solobacterium__moorei
## 9                 g__Gemella                      s__unclassified
## 10           g__Anaerofustis     s__Anaerofustis__stercorihominis
## 11               g__Absiella                      s__unclassified
## 12              g__Blautia_A                    s__PROV_t__165457
## 13          g__Acutalibacter         s__Acutalibacter__timonensis
## 14                g__Blautia                      s__unclassified
## 15             g__Holdemania                      s__unclassified
## 16            g__Bacteroides                      s__unclassified
## 17           g__unclassified                      s__unclassified
## 18       g__Faecalibacterium   s__Faecalibacterium__prausnitzii_K
## 19          g__Coprococcus_A              s__Coprococcus_A__catus
## 20           g__unclassified                      s__unclassified
## 21            g__Lactococcus                      s__unclassified
## 22           g__unclassified                      s__unclassified
## 23          g__Bacteroides_B           s__Bacteroides_B__vulgatus
## 24 g__Erysipelatoclostridium                      s__unclassified
## 25           g__Anaerotignum                      s__unclassified
## 26          g__Eubacterium_E                      s__unclassified
## 27         g__PROV_t__256727                    s__PROV_t__256727
## 28           g__unclassified                      s__unclassified
## 29        g__Parabacteroides           s__Parabacteroides__merdae
## 30           g__unclassified                      s__unclassified
## 31           g__unclassified                      s__unclassified
## 32           g__unclassified                      s__unclassified
## 33            g__Akkermansia          s__Akkermansia__muciniphila
## 34          g__Streptococcus                      s__unclassified
## 35            g__Bacteroides                      s__unclassified
## 36  g__Anaeromassilibacillus                    s__PROV_t__258798
## 37            g__Bacteroides     s__Bacteroides__thetaiotaomicron
## 38          g__Haemophilus_D                      s__unclassified
## 39           g__unclassified                      s__unclassified
## 40           g__Turicibacter                      s__unclassified
## 41              g__Blautia_A                    s__PROV_t__220541
## 42           g__unclassified                      s__unclassified
## 43            g__Eubacterium                      s__unclassified
## 44         g__Ruminococcus_A                      s__unclassified
## 45              g__Blautia_A                      s__unclassified
## 46              g__Blautia_A               s__Blautia_A__wexlerae
## 47         g__PROV_t__182732                      s__unclassified
## 48           g__unclassified                      s__unclassified
## 49         g__Intestinimonas s__Intestinimonas__butyriciproducens
## 50          g__Eubacterium_E           s__Eubacterium_E__hallii_A
## 51            g__Veillonella                      s__unclassified
## 52            g__Veillonella                      s__unclassified
## 53           g__unclassified                      s__unclassified
## 54           g__unclassified                      s__unclassified
## 55            g__Collinsella                      s__unclassified
## 56            g__Odoribacter         s__Odoribacter__splanchnicus
## 57         g__PROV_t__186843                    s__PROV_t__186843
## 58         g__PROV_t__209626                    s__PROV_t__209626
##             Strain
## 1  t__unclassified
## 2  t__unclassified
## 3        t__158681
## 4  t__unclassified
## 5  t__unclassified
## 6  t__unclassified
## 7  t__unclassified
## 8  t__unclassified
## 9  t__unclassified
## 10 t__unclassified
## 11 t__unclassified
## 12       t__165457
## 13       t__145645
## 14 t__unclassified
## 15 t__unclassified
## 16 t__unclassified
## 17 t__unclassified
## 18 t__unclassified
## 19        t__92557
## 20 t__unclassified
## 21 t__unclassified
## 22 t__unclassified
## 23        t__21615
## 24 t__unclassified
## 25 t__unclassified
## 26 t__unclassified
## 27       t__256727
## 28 t__unclassified
## 29 t__unclassified
## 30 t__unclassified
## 31 t__unclassified
## 32 t__unclassified
## 33 t__unclassified
## 34 t__unclassified
## 35 t__unclassified
## 36       t__258798
## 37 t__unclassified
## 38 t__unclassified
## 39 t__unclassified
## 40 t__unclassified
## 41       t__220541
## 42 t__unclassified
## 43 t__unclassified
## 44 t__unclassified
## 45 t__unclassified
## 46 t__unclassified
## 47 t__unclassified
## 48 t__unclassified
## 49 t__unclassified
## 50 t__unclassified
## 51 t__unclassified
## 52 t__unclassified
## 53 t__unclassified
## 54 t__unclassified
## 55 t__unclassified
## 56 t__unclassified
## 57       t__186843
## 58       t__209626
# ANCOM found no significant taxa, so the combined significance table is
# just the metagenomeSeq and DESeq2 results stacked together.
#   fullsigtab     - print-friendly version (taxonomy only, ASV column dropped)
#   fullsigtab_esv - full version that retains the ASV identifier column

fullsigtab<-rbind(Mtg_allsig.print, deseq_allsig.print)
fullsigtab_esv<-rbind(Mtg_allsig, deseq_allsig)
fullsigtab
##            Method+Data      Domain               Phylum                  Class
## 1               Mtg_P1 d__Bacteria  p__Actinobacteriota      c__Actinobacteria
## 2               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 3               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 4               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 5               Mtg_P1 d__Bacteria        p__Firmicutes             c__Bacilli
## 6               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 7               Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 8               Mtg_P1 d__Bacteria        p__Firmicutes             c__Bacilli
## 9               Mtg_P1 d__Bacteria        p__Firmicutes             c__Bacilli
## 10              Mtg_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 11              Mtg_P2 d__Bacteria        p__Firmicutes             c__Bacilli
## 12              Mtg_P2 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 13              Mtg_P2 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 14              Mtg_P2 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 15              Mtg_P2 d__Bacteria        p__Firmicutes             c__Bacilli
## 16              Mtg_P2 d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 17              Mtg_P2 d__Bacteria      p__unclassified        c__unclassified
## 18              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 19              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 20              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 21              Mtg_P3 d__Bacteria        p__Firmicutes             c__Bacilli
## 22              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 23              Mtg_P3 d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 24              Mtg_P3 d__Bacteria        p__Firmicutes             c__Bacilli
## 25              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 26              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 27              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 28              Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 29             Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 30             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 31             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 32             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 33             Mtg_All d__Bacteria p__Verrucomicrobiota    c__Verrucomicrobiae
## 34             Mtg_All d__Bacteria        p__Firmicutes             c__Bacilli
## 35             Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 36             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 37             Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 38             Mtg_All d__Bacteria    p__Proteobacteria c__Gammaproteobacteria
## 39             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 40             Mtg_All d__Bacteria        p__Firmicutes             c__Bacilli
## 41             Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 42             Mtg_All d__Bacteria      p__Firmicutes_A        c__unclassified
## 43        Mtg_P2Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 44       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 45       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 46       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 47       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 48       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 49       Mtg_P1Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 50       Mtg_P2Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 51        Mtg_P1Mtg_P3 d__Bacteria      p__Firmicutes_C       c__Negativicutes
## 52       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_C       c__Negativicutes
## 53       Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 54  Mtg_P1Mtg_P2Mtg_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 55       Mtg_P2Mtg_All d__Bacteria  p__Actinobacteriota      c__Coriobacteriia
## 56       Mtg_P3Mtg_All d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 57 Mtg_P2Mtg_P3Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 58       Mtg_P2Mtg_All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 59           DESeq2_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 60           DESeq2_P1 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 61           DESeq2_P1 d__Bacteria      p__Firmicutes_C       c__Negativicutes
## 62           DESeq2_P2 d__Bacteria      p__unclassified        c__unclassified
## 63           DESeq2_P2  d__Archaea     p__Euryarchaeota     c__Methanobacteria
## 64           DESeq2_P3 d__Bacteria      p__unclassified        c__unclassified
## 65           DESeq2_P3 d__Bacteria      p__Bacteroidota         c__Bacteroidia
## 66           DESeq2_P3 d__Bacteria        p__Firmicutes             c__Bacilli
## 67           DESeq2_P3 d__Bacteria      p__Firmicutes_A          c__Clostridia
## 68          DESeq2_All d__Bacteria        p__Firmicutes             c__Bacilli
## 69       DESeq2_P3+All d__Bacteria      p__Firmicutes_A          c__Clostridia
## 70        DESeq2_P1+P3 d__Bacteria      p__Bacteroidota         c__Bacteroidia
##                      Order                       Family
## 1       o__Actinomycetales        f__Bifidobacteriaceae
## 2        o__Lachnospirales           f__Lachnospiraceae
## 3        o__Lachnospirales           f__Lachnospiraceae
## 4        o__Lachnospirales           f__Lachnospiraceae
## 5       o__Lactobacillales          f__Streptococcaceae
## 6       o__Oscillospirales          f__Oscillospiraceae
## 7        o__Lachnospirales           f__Lachnospiraceae
## 8    o__Erysipelotrichales       f__Erysipelotrichaceae
## 9      o__Staphylococcales               f__Gemellaceae
## 10        o__Eubacteriales           f__Anaerofustaceae
## 11   o__Erysipelotrichales       f__Erysipelotrichaceae
## 12       o__Lachnospirales           f__Lachnospiraceae
## 13      o__Oscillospirales        f__Acutalibacteraceae
## 14       o__Lachnospirales           f__Lachnospiraceae
## 15   o__Erysipelotrichales       f__Erysipelotrichaceae
## 16        o__Bacteroidales            f__Bacteroidaceae
## 17         o__unclassified              f__unclassified
## 18      o__Oscillospirales           f__Ruminococcaceae
## 19       o__Lachnospirales           f__Lachnospiraceae
## 20       o__Lachnospirales           f__Lachnospiraceae
## 21      o__Lactobacillales          f__Streptococcaceae
## 22       o__Lachnospirales           f__Lachnospiraceae
## 23        o__Bacteroidales            f__Bacteroidaceae
## 24   o__Erysipelotrichales f__Erysipelatoclostridiaceae
## 25       o__Lachnospirales           f__Anaerotignaceae
## 26       o__Lachnospirales           f__Lachnospiraceae
## 27       o__Lachnospirales           f__Lachnospiraceae
## 28      o__Oscillospirales           f__Ruminococcaceae
## 29        o__Bacteroidales            f__Tannerellaceae
## 30      o__Oscillospirales        f__Acutalibacteraceae
## 31         o__unclassified              f__unclassified
## 32       o__Lachnospirales           f__Lachnospiraceae
## 33   o__Verrucomicrobiales           f__Akkermansiaceae
## 34      o__Lactobacillales          f__Streptococcaceae
## 35        o__Bacteroidales            f__Bacteroidaceae
## 36      o__Oscillospirales        f__Acutalibacteraceae
## 37        o__Bacteroidales            f__Bacteroidaceae
## 38     o__Enterobacterales           f__Pasteurellaceae
## 39         o__unclassified              f__unclassified
## 40      o__Haloplasmatales         f__Turicibacteraceae
## 41       o__Lachnospirales           f__Lachnospiraceae
## 42         o__unclassified              f__unclassified
## 43        o__Eubacteriales            f__Eubacteriaceae
## 44       o__Lachnospirales           f__Lachnospiraceae
## 45       o__Lachnospirales           f__Lachnospiraceae
## 46       o__Lachnospirales           f__Lachnospiraceae
## 47       o__Lachnospirales           f__Lachnospiraceae
## 48       o__Lachnospirales           f__Lachnospiraceae
## 49      o__Oscillospirales          f__Oscillospiraceae
## 50       o__Lachnospirales           f__Lachnospiraceae
## 51       o__Veillonellales           f__Veillonellaceae
## 52       o__Veillonellales           f__Veillonellaceae
## 53      o__Oscillospirales          f__Oscillospiraceae
## 54 o__Peptostreptococcales          f__Anaerovoracaceae
## 55     o__Coriobacteriales         f__Coriobacteriaceae
## 56        o__Bacteroidales            f__Marinifilaceae
## 57   o__Christensenellales       f__Christensenellaceae
## 58       o__Lachnospirales           f__Lachnospiraceae
## 59      o__Oscillospirales           f__Ruminococcaceae
## 60         o__unclassified              f__unclassified
## 61       o__Veillonellales            f__Dialisteraceae
## 62         o__unclassified              f__unclassified
## 63   o__Methanobacteriales       f__Methanobacteriaceae
## 64         o__unclassified              f__unclassified
## 65        o__Bacteroidales            f__Bacteroidaceae
## 66      o__Lactobacillales          f__Streptococcaceae
## 67       o__Lachnospirales           f__Lachnospiraceae
## 68   o__Erysipelotrichales       f__Erysipelotrichaceae
## 69      o__Oscillospirales           f__Ruminococcaceae
## 70        o__Bacteroidales           f__Barnesiellaceae
##                        Genus                              Species
## 1         g__Bifidobacterium          s__Bifidobacterium__bifidum
## 2            g__unclassified                      s__unclassified
## 3            g__Anaerostipes              s__Anaerostipes__hadrus
## 4               g__Blautia_A                      s__unclassified
## 5           g__Streptococcus                      s__unclassified
## 6           g__Lawsonibacter                      s__unclassified
## 7           g__Clostridium_M      s__Clostridium_M__asparagiforme
## 8           g__Solobacterium             s__Solobacterium__moorei
## 9                 g__Gemella                      s__unclassified
## 10           g__Anaerofustis     s__Anaerofustis__stercorihominis
## 11               g__Absiella                      s__unclassified
## 12              g__Blautia_A                    s__PROV_t__165457
## 13          g__Acutalibacter         s__Acutalibacter__timonensis
## 14                g__Blautia                      s__unclassified
## 15             g__Holdemania                      s__unclassified
## 16            g__Bacteroides                      s__unclassified
## 17           g__unclassified                      s__unclassified
## 18       g__Faecalibacterium   s__Faecalibacterium__prausnitzii_K
## 19          g__Coprococcus_A              s__Coprococcus_A__catus
## 20           g__unclassified                      s__unclassified
## 21            g__Lactococcus                      s__unclassified
## 22           g__unclassified                      s__unclassified
## 23          g__Bacteroides_B           s__Bacteroides_B__vulgatus
## 24 g__Erysipelatoclostridium                      s__unclassified
## 25           g__Anaerotignum                      s__unclassified
## 26          g__Eubacterium_E                      s__unclassified
## 27         g__PROV_t__256727                    s__PROV_t__256727
## 28           g__unclassified                      s__unclassified
## 29        g__Parabacteroides           s__Parabacteroides__merdae
## 30           g__unclassified                      s__unclassified
## 31           g__unclassified                      s__unclassified
## 32           g__unclassified                      s__unclassified
## 33            g__Akkermansia          s__Akkermansia__muciniphila
## 34          g__Streptococcus                      s__unclassified
## 35            g__Bacteroides                      s__unclassified
## 36  g__Anaeromassilibacillus                    s__PROV_t__258798
## 37            g__Bacteroides     s__Bacteroides__thetaiotaomicron
## 38          g__Haemophilus_D                      s__unclassified
## 39           g__unclassified                      s__unclassified
## 40           g__Turicibacter                      s__unclassified
## 41              g__Blautia_A                    s__PROV_t__220541
## 42           g__unclassified                      s__unclassified
## 43            g__Eubacterium                      s__unclassified
## 44         g__Ruminococcus_A                      s__unclassified
## 45              g__Blautia_A                      s__unclassified
## 46              g__Blautia_A               s__Blautia_A__wexlerae
## 47         g__PROV_t__182732                      s__unclassified
## 48           g__unclassified                      s__unclassified
## 49         g__Intestinimonas s__Intestinimonas__butyriciproducens
## 50          g__Eubacterium_E           s__Eubacterium_E__hallii_A
## 51            g__Veillonella                      s__unclassified
## 52            g__Veillonella                      s__unclassified
## 53           g__unclassified                      s__unclassified
## 54           g__unclassified                      s__unclassified
## 55            g__Collinsella                      s__unclassified
## 56            g__Odoribacter         s__Odoribacter__splanchnicus
## 57         g__PROV_t__186843                    s__PROV_t__186843
## 58         g__PROV_t__209626                    s__PROV_t__209626
## 59       g__Faecalibacterium                      s__unclassified
## 60           g__unclassified                      s__unclassified
## 61              g__Dialister                      s__PROV_t__4989
## 62           g__unclassified                      s__unclassified
## 63   g__Methanobrevibacter_A                      s__unclassified
## 64           g__unclassified                      s__unclassified
## 65            g__Bacteroides         s__Bacteroides__intestinalis
## 66            g__Lactococcus                      s__unclassified
## 67              g__Blautia_A                      s__unclassified
## 68           g__Holdemanella            s__Holdemanella__biformis
## 69       g__Faecalibacterium                      s__unclassified
## 70            g__Barnesiella     s__Barnesiella__intestinihominis
##             Strain
## 1  t__unclassified
## 2  t__unclassified
## 3        t__158681
## 4  t__unclassified
## 5  t__unclassified
## 6  t__unclassified
## 7  t__unclassified
## 8  t__unclassified
## 9  t__unclassified
## 10 t__unclassified
## 11 t__unclassified
## 12       t__165457
## 13       t__145645
## 14 t__unclassified
## 15 t__unclassified
## 16 t__unclassified
## 17 t__unclassified
## 18 t__unclassified
## 19        t__92557
## 20 t__unclassified
## 21 t__unclassified
## 22 t__unclassified
## 23        t__21615
## 24 t__unclassified
## 25 t__unclassified
## 26 t__unclassified
## 27       t__256727
## 28 t__unclassified
## 29 t__unclassified
## 30 t__unclassified
## 31 t__unclassified
## 32 t__unclassified
## 33 t__unclassified
## 34 t__unclassified
## 35 t__unclassified
## 36       t__258798
## 37 t__unclassified
## 38 t__unclassified
## 39 t__unclassified
## 40 t__unclassified
## 41       t__220541
## 42 t__unclassified
## 43 t__unclassified
## 44 t__unclassified
## 45 t__unclassified
## 46 t__unclassified
## 47 t__unclassified
## 48 t__unclassified
## 49 t__unclassified
## 50 t__unclassified
## 51 t__unclassified
## 52 t__unclassified
## 53 t__unclassified
## 54 t__unclassified
## 55 t__unclassified
## 56 t__unclassified
## 57       t__186843
## 58       t__209626
## 59 t__unclassified
## 60 t__unclassified
## 61         t__4989
## 62 t__unclassified
## 63 t__unclassified
## 64 t__unclassified
## 65 t__unclassified
## 66 t__unclassified
## 67 t__unclassified
## 68       t__262551
## 69 t__unclassified
## 70        t__21316

4.5 Visualization of significant taxa

### functions to plot
make_vis_plots <- function(ps_norm, grouping, tax, plot_type=c('box', 'bar')){
  # Plot normalized counts for selected taxa, faceted by taxon and grouped
  # by within-study sampling date.
  # ps_norm: a normalized (DESeq or CSS) phyloseq object
  # grouping: sample_data column name used as the fill/grouping variable
  # tax: taxonomical bin ids (ASVs) in the counts table to plot
  # plot_type: 'box' (log-scale boxplot) or 'bar' (mean +/- SEM barplot)
  # Returns a ggplot object.

  # FIX: resolve plot_type against its allowed values. Previously, calling
  # the function without plot_type passed the full default vector into
  # if(plot_type=='box') below, which is an error in R >= 4.2 (and silently
  # used only the first element before that).
  plot_type <- match.arg(plot_type)

  # subset phyloseq object to the ASVs of interest
  ps_filt=prune_taxa(taxa_names(ps_norm) %in% tax, ps_norm)
  
  # get normalized counts
  plot_table<-data.table(otu_table(ps_filt), keep.rownames=TRUE)[rn %in% tax]
  # melt to long format and add a small pseudo-count (min nonzero / 100000)
  # so zero counts survive the log10 axis used below
  plot_table <- melt(plot_table, id.vars='rn')
  plot_table$value <- plot_table$value+min(plot_table[value!=0]$value)/100000
  
  # add metadata (grouping variable and sampling date), keyed on sample id
  groupDT=data.table(data.frame(sample_data(ps_filt)[, c(grouping, 'Within.study.sampling.date')]), keep.rownames=TRUE)
  setnames(groupDT, 'rn', 'variable')
  plot_table <- merge(plot_table, groupDT, by='variable', all.x=TRUE)
  
  # rename the grouping column to a fixed name for the aes() mapping
  setnames(plot_table, grouping, 'Group')

  # boxplot of per-sample values on a log10 scale
  if(plot_type=='box'){
    ggplot(data=plot_table, aes(x=Within.study.sampling.date, y = value, fill=Group)) + 
      geom_boxplot(outlier.color=NA) +
      geom_jitter(position=position_jitterdodge(0.2), cex=1.5, color="gray44") + 
      labs(title =deparse(substitute(ps_norm)), x='', y ='Proportional counts, log scale') + 
      scale_y_log10() + 
      scale_fill_manual(values=sgColorPalette)+
      theme_minimal() + facet_wrap(~rn, scales='free', ncol=3)+
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  } else if (plot_type=='bar'){
    # barplot of group means with SEM error bars
    plot_table2 <- plot_table[, list(mean_ct=mean(value), sem=sd(value)/sqrt(.N)), by=c('Group', 'Within.study.sampling.date', 'rn')]
    ggplot(data=plot_table2, aes(x=Within.study.sampling.date, y =mean_ct, fill=Group)) + 
      geom_bar(stat='identity', position=position_dodge()) +
      geom_errorbar(aes(ymin=mean_ct-sem, ymax=mean_ct+sem), width=0.2, position=position_dodge(0.9))+ 
      labs(title =deparse(substitute(ps_norm)), x='', y ='Proportional counts, 0 to 1 scale') + 
      #scale_y_log10() + 
      scale_fill_manual(values=sgColorPalette)+
      theme_minimal() + facet_wrap(~rn, scales='free', ncol=3)+
      theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))
  }
}
######BOXPLOT of significant ones
# Collect the adjusted p-values from every differential-abundance test into
# one table, full-outer-joined on taxon id ('rn'), so every p-value is kept
# even when a taxon is significant in only a subset of the tests.
res_tables <- list(
  data.table(deseq_res_P1[[2]], keep.rownames=TRUE)[, list(rn, deseq_P1_adjp=padj)],
  data.table(deseq_res_P2[[2]], keep.rownames=TRUE)[, list(rn, deseq_P2_adjp=padj)],
  data.table(deseq_res_P3[[2]], keep.rownames=TRUE)[, list(rn, deseq_P3_adjp=padj)],
  data.table(bla[[2]], keep.rownames=TRUE)[, list(rn, deseq_timeseries_adjp=padj)],
  data.table(zig_res_P1[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_P1_adjp=adjPvalues)],
  data.table(zig_res_P2[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_P2_adjp=adjPvalues)],
  data.table(zig_res_P3[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_P3_adjp=adjPvalues)],
  data.table(zig_res_all[[2]], keep.rownames=TRUE)[, list(rn, mtgseq_timeseries_adjp=adjPvalues)]
)
significant_tax <- Reduce(function(x, y) merge(x, y, by='rn', all=TRUE), res_tables)

# drop the placeholder row
significant_tax <- significant_tax[rn!='nothing']

# write results
write.csv(significant_tax, file=paste0(output_data, 'Significant_res_deseq_q', deseq_cut, '_mtgseq_q', mtgseq_cut, '.csv'), row.names=FALSE)

datatable(significant_tax)
# also, find taxonomical annotations
# NOTE: single ASV may have multiple annotations due to tie hits
# (using tax_table(ps_not_norm_comp) directly instead of the all_tax_data object)
datatable(tax_table(ps_not_norm_comp)[rownames(tax_table(ps_not_norm_comp)) %in% significant_tax$rn])

4.5.1 DESeq results

## plot
# Taxa called significant by DESeq in all three phases (non-NA adjusted
# p-value in P1, P2 and P3 simultaneously)
com_deseq_taxa=significant_tax[!is.na(deseq_P1_adjp) & !is.na(deseq_P2_adjp) & !is.na(deseq_P3_adjp)]

if(nrow(com_deseq_taxa)==0){
  print('no common DESeq significant taxa')
} else {
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', com_deseq_taxa$rn, 'box'))
}
## [1] "no common DESeq significant taxa"

4.5.2 DESeq timeseries results

# deseq timeseries
# Taxa with a DESeq time-series adjusted p-value; shown as box and bar plots
deseq_ts_taxa <- significant_tax[!is.na(deseq_timeseries_adjp)]$rn
if(length(deseq_ts_taxa)==0){
  print('no DESeq timeseries significant taxa')
} else {
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', deseq_ts_taxa, 'box'))
  # plot bar as well
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', deseq_ts_taxa, 'bar'))
}

4.5.3 metagenomeSeq results

# common by metagenomeseq
# Taxa called significant by metagenomeSeq in all three phases
com_mtgseq_taxa=significant_tax[!is.na(mtgseq_P1_adjp) & !is.na(mtgseq_P2_adjp) & !is.na(mtgseq_P3_adjp)]

if(nrow(com_mtgseq_taxa)==0){
  print('no common metagenomeSeq significant taxa')
} else {
  print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', com_mtgseq_taxa$rn, 'box'))
}

### Meta_genome timeseries results

# mtgseq timeseries
# Taxa with a metagenomeSeq time-series adjusted p-value, plotted in batches
# of 6 facets per figure so each panel stays readable.
#
# FIX: this block was previously repeated five times with hard-coded index
# windows (1:6, 7:12, ..., 25:27). Any window past the number of significant
# taxa produced NA taxon ids, and the code did not scale with the data.
# Chunking by the actual count avoids both problems.
mtgseq_ts_taxa <- significant_tax[!is.na(mtgseq_timeseries_adjp)]$rn
if(length(mtgseq_ts_taxa)>0){
  # assign each taxon to a batch of at most 6
  chunk_id <- ceiling(seq_along(mtgseq_ts_taxa)/6)
  for(taxa_chunk in split(mtgseq_ts_taxa, chunk_id)){
    print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', taxa_chunk, 'box'))
    # plot bar as well
    print(make_vis_plots(ps_TSS_norm_pass_min_postDD_sup003, 'phenotype', taxa_chunk, 'bar'))
  }
} else {
  print('no Mtgseq timeseries significant taxa')
}

5 CCA and visualization

Compare resulting amplicon data between and within sample types by canonical correlation analysis, regression profiling, and visualization (e.g. non-metric multi-dimensional scaling [NMDS], principal coordinates analysis, principal component analysis).

5.1 Constrained by status A or N

plotting_phenotype_consPcoA <- function(ps, title){
  # Constrained PCoA (CAP on Bray-Curtis) ordinated by phenotype, restricted
  # to families that have all 6 samples so the plot stays readable.
  # Also computes betadisper on the FULL object, grouped by family id.
  # Returns a list: [[1]] the ggplot, [[2]] the betadisper result.
  complete_fams <- names(which(table(sample_data(ps)$Family.group.ID) == 6))
  ps_complete <- prune_samples(sample_data(ps)$Family.group.ID %in% complete_fams, ps)
  cap_ord <- ordinate(
    physeq = ps_complete,
    method = "CAP",
    distance = "bray",
    formula = ~ phenotype
  )
  ord_plot <- plot_ordination(
    physeq = ps_complete,
    ordination = cap_ord,
    color = "phenotype",
    axes = c(1, 2),
    title = paste("Constrained PcoA", title, "ordinated by phenotype with all timepoints")
  ) +
    geom_point(size = 2) +
    scale_color_manual(values = sgColorPalette) +
    theme_minimal() +
    theme(text = element_text(size = 10), plot.title = element_text(size = 10))

  # dispersion is computed on ALL samples (not just the complete families)
  bray_all <- phyloseq::distance(ps, method = "bray")
  meta_all <- data.frame(sample_data(ps))
  dispersion <- betadisper(bray_all, meta_all$Family.group.ID)

  list(ord_plot, dispersion)
}

# Phenotype-constrained PCoA on the DESeq-normalized data;
# element [[1]] is the ggplot, [[2]] the betadisper result
DeSeq_distance<-plotting_phenotype_consPcoA(ps_DeSeq_norm_pass_min_postDD_sup003, "Deseq")
# plot
DeSeq_distance[[1]]

# same analysis on the CSS-normalized data
CSS_distance<-plotting_phenotype_consPcoA(ps_CSS_norm_pass_min_postDD_sup003, "CSS")
# plot
CSS_distance[[1]]

5.2 Constrained by family

#plotting
#Now we have: 803 taxa and 559 samples

#Looking at the family fro the complete set of samples

#Keeping the same ordination but filtering to the families with only 6 point to help vizualize the plot

#Looking at NORMALIZATION
plotting_Fam_consPcoA <- function(ps, title){
  # Constrained PCoA (CAP on Bray-Curtis) ordinated by family, restricted to
  # families that have all 6 samples to help visualize the plot.
  # Also computes betadisper on the FULL object, grouped by family id.
  # Returns a list: [[1]] the ggplot, [[2]] the betadisper result.
  complete_fams <- names(which(table(sample_data(ps)$Family.group.ID) == 6))
  ps_complete <- prune_samples(sample_data(ps)$Family.group.ID %in% complete_fams, ps)
  # prefix the ids with 'fam' so they are handled as labels, not numbers
  sample_data(ps_complete)$Family.group.ID <- paste0('fam', as.character(sample_data(ps_complete)$Family.group.ID))
  cap_ord <- ordinate(
    physeq = ps_complete,
    method = "CAP",
    distance = "bray",
    formula = ~ Family.group.ID
  )
  ord_plot <- plot_ordination(
    physeq = ps_complete,
    ordination = cap_ord,
    color = "Family.group.ID",
    axes = c(1, 2),
    title = paste("Constrained PcoA", title, "ordinated by families with all timepoints")
  ) +
    geom_point(size = 2) +
    theme_minimal() +
    theme(text = element_text(size = 10), plot.title = element_text(size = 10), legend.position = 'none')

  # dispersion is computed on ALL samples (not just the complete families)
  bray_all <- phyloseq::distance(ps, method = "bray")
  meta_all <- data.frame(sample_data(ps))
  dispersion <- betadisper(bray_all, meta_all$Family.group.ID)

  list(ord_plot, dispersion)
}

# Family-constrained PCoA on the DESeq-normalized data
# (note: this overwrites the DeSeq_distance/CSS_distance objects created for
# the phenotype-constrained version above)
DeSeq_distance<-plotting_Fam_consPcoA(ps_DeSeq_norm_pass_min_postDD_sup003, "Deseq")
# plot
DeSeq_distance[[1]]

# same with CSS normalization
CSS_distance<-plotting_Fam_consPcoA(ps_CSS_norm_pass_min_postDD_sup003, "CSS")
# plot
CSS_distance[[1]]

# Compare within-family dispersion (betadisper distance-to-centroid values)
# between the two normalizations
#pdf(file=paste0(output_data, "Figures/Distance_DeSeq_CSS_", Sys.Date(), ".pdf"))
boxplot(DeSeq_distance[[2]]$distances,CSS_distance[[2]]$distances, names=c("DeSeq", "CSS"), 
        xlab = "Type of Normalization", ylab = "Distance on Component 1 & 2", main ="Intragroup distance for each family")

#dev.off()

6 Diversity

Characterize and assess the diversity of each sample, and evaluate the extent of dissimilarity between the cohorts

6.1 Alpha diversity

# Alpha diversity on the non-normalized counts: Observed richness, Chao1 and
# Shannon index per sample, joined with phenotype/family/date metadata and
# melted to long format for faceted plotting.
ER <- estimate_richness(ps_not_norm_comp, measures=c("Observed", "Chao1", "Shannon"))
ER <- cbind(ER, sample_data(ps_not_norm_comp)[row.names(ER), c("phenotype", "Family.group.ID", "Within.study.sampling.date")])
ER <- data.table(ER, keep.rownames = TRUE)
ER <- melt(ER, id.vars=c('rn', 'phenotype', "Family.group.ID", "Within.study.sampling.date"))

# boxplot per index (the Chao1 standard-error column is dropped), by phenotype
ggplot(data=ER[variable!='se.chao1'], aes(x=phenotype, y=value, fill=phenotype))+
  geom_boxplot(width=0.7, outlier.colour='white')+
  geom_jitter(size=1, position=position_jitter(width=0.1))+
  xlab('')+ylab('')+
  scale_fill_manual(values=sgColorPalette)+
  theme_minimal()+facet_wrap(~variable, scales='free')

# equal-variance two-sample t-test of each alpha index between phenotypes
ttest <- rbindlist(lapply(c('Observed', 'Chao1', 'Shannon'), function(alphad){
  ttest_res <- t.test(value ~ phenotype, data=ER[variable==alphad], var.equal=TRUE)
  data.table(alpha_index=alphad, pvalue=ttest_res$p.value)
}))

pander(ttest)
alpha_index pvalue
Observed 0.2772
Chao1 0.2772
Shannon 0.5052

6.2 Beta diversity

# Unconstrained PCoA on Bray-Curtis distances of the DESeq-normalized data,
# colored by phenotype -- the two groups largely overlap (not much difference)
GP.ord <- ordinate(ps_DeSeq_norm_pass_min_postDD_sup003, "PCoA", "bray")
p2 = plot_ordination(ps_DeSeq_norm_pass_min_postDD_sup003, GP.ord, type="samples", color="phenotype") +
  geom_point( size = 1)+
  scale_color_manual(values=sgColorPalette)+
  theme_minimal()
p2

7 PERMANOVA

Non-parametric statistical approaches (ANOSIM, ADONIS, ANOVA, PERMANOVA, etc.) will be employed to determine the significance of noteworthy factors, such as digital phenotype, probiotic and/or antibiotic use.

7.1 PERMANOVA test

permanova <- function(physeq, factorName, ifnumeric, pmt=999){
  # Single-variable PERMANOVA (vegan::adonis) on Bray-Curtis distances.
  # physeq: phyloseq object; factorName: metadata column to test;
  # ifnumeric: coerce the variable to numeric before testing;
  # pmt: number of permutations.
  # Returns a one-row data.table with the sample count, R2 and p-value.
  set.seed(1)  # fixed seed so permutation p-values are reproducible
  braydist = phyloseq::distance(physeq, "bray")  # name must match the formula below
  model_formula <- as.formula(paste0("braydist ~ ", factorName))
  meta_df <- data.frame(sample_data(physeq)[, as.character(factorName)])
  # coerce to numeric when requested and record the variable class
  if (ifnumeric) {
    meta_df[, factorName] <- as.numeric(meta_df[, factorName])
    factor_class <- 'numeric'
  } else {
    factor_class <- 'categorical'
  }
  fit <- adonis(model_formula, permutations = pmt, meta_df)
  data.table(
    Variable = factorName,
    FactorClass = factor_class,
    TotalN = fit$aov.tab['Total', 'Df'] + 1,
    R2 = fit$aov.tab[factorName, 'R2'],
    pvalue = fit$aov.tab[factorName, 'Pr(>F)'][1]
  )
}
# Beta-dispersion screen: for every candidate metadata factor, test whether
# group dispersions differ (betadisper + permutest). Factors with significant
# dispersion heterogeneity are excluded later, since PERMANOVA results are
# unreliable when dispersions differ between groups.
# keep only the categories selected above as relevant
tmp_metadat<-metadata_ok[,c(num_cat,fac_cat)]
# additional removals: not enough samples per level
tmp_metadat<-tmp_metadat[,-which(colnames(tmp_metadat) %in% c("Number.of.pet.reptiles","Number.of.pet.horses", "Pet.horse"))]
# additional removals: columns with only NA or a single factor level cannot
# be tested (permutest/adonis requirements)
col_levels<-sapply(tmp_metadat, levels)
col_levelscount<-sapply(col_levels, length)
tmp_metadat_1 <- tmp_metadat   # keep the pre-filter copy
# Since there are no numeric columns (see commented check below), drop all
# columns that do not have at least 2 factor levels
#tmp_metadat[,which(sapply(tmp_metadat, class) == "numeric")]
tmp_metadat <- tmp_metadat[,which(col_levelscount >= 2)]

set.seed(1)
pval_factors_diper=c()   # permutest p-value per factor
nb_samples_disper=c()    # number of samples actually used per factor
for (i in 1:length(tmp_metadat)){
  #cat (i,"\t")
  # drop samples with NA or empty values for this factor
  test_map<-tmp_metadat[!is.na(tmp_metadat[,i]) & tmp_metadat[,i] != "" ,]
  ps.tmp<-copy(ps_DeSeq_norm_pass_min_postDD_sup003)
  sample_data(ps.tmp) <- test_map   # assigning sample_data also prunes samples
  df_metadata <- data.frame(sample_data(ps.tmp))
  df_metadata<-df_metadata[df_metadata[,colnames(test_map)[i]] != "",]
  # use prune_samples instead of subset_samples
  # NOTE(review): keepid additionally drops literal 'NA' strings, which
  # df_metadata above does not -- if any column stores "NA" as text, the
  # grouping vector passed to betadisper could disagree in length with the
  # distance matrix from the pruned ps.tmp; confirm on the real metadata
  keepid=!is.na(get_variable(ps.tmp, colnames(test_map)[i])) &
    get_variable(ps.tmp, colnames(test_map)[i])!='' &
    get_variable(ps.tmp, colnames(test_map)[i])!='NA' 
  ps.tmp <- prune_samples(keepid, ps.tmp)
  #ps.tmp <- subset_samples(ps.tmp, colnames(test_map)[i] !="")
  tmp_nb_samples<-dim(otu_table(ps.tmp))[2]
  OTU_tables_bray <- phyloseq::distance(ps.tmp, method = "bray")
  beta <- betadisper(OTU_tables_bray, df_metadata[,colnames(test_map)[i]])
  tmp<-permutest(beta)
  tmp<-tmp$tab$`Pr(>F)`[1]
  pval_factors_diper<-c(pval_factors_diper,tmp)
  nb_samples_disper<-c(nb_samples_disper,tmp_nb_samples)}
# FDR-correct the p-values; factors with adjusted p < 0.05 have heterogeneous
# dispersion and will be removed from the PERMANOVA interpretation below
names(pval_factors_diper)<-colnames(tmp_metadat)
pval_factors_diper<-p.adjust(pval_factors_diper, method = "fdr")
to_remove_betadis<-names(pval_factors_diper)[pval_factors_diper<0.05]
  
# Build the list of variables to test with PERMANOVA.
# (original construction of meta_cat kept for reference; meta_cat is assumed
# to already exist as a table with columns permanova/varname/type)
#meta_cat <- tibble(col_levelscount >= 2, colnames(tmp_metadat_1), sapply(tmp_metadat_1, class))
#rownames(meta_cat) <- colnames(tmp_metadat_1)
#colnames(meta_cat) <- c("permanova", "varname", "type")
#meta_cat$type <- gsub("factor", "Categorical", meta_cat$type)
#meta_cat$type <- gsub("numerical", "Continuous", meta_cat$type)

# the meta_cat file listed phenotype as FALSE for permanova, so add it back in
meta_cat$permanova[which(meta_cat$varname == "phenotype")] <- "Categorical"
permanova_var=meta_cat[which(meta_cat$permanova!=FALSE),]

# flag the diet-dictionary items and stool frequency as Continuous
# (presumably ordinal scores -- confirm against the questionnaire coding)
permanova_var$permanova[which(permanova_var$varname %in% c(dict_1_items, dict_2_items, "Stool.frequency"))] <- rep("Continuous", length(permanova_var$permanova[which(permanova_var$varname %in% c(dict_1_items, dict_2_items, "Stool.frequency"))]))

set.seed(1)
permanova_res=NULL
# Run a single-variable PERMANOVA for every candidate variable, after
# removing samples with missing/empty values for that variable.
for(j in seq_len(nrow(permanova_var))){
  # variable name (as.character in case varname is stored as a factor)
  var_name=as.character(permanova_var$varname[j])
  # drop samples whose value is NA, the literal text 'NA', or empty
  keepid=!is.na(get_variable(ps_DeSeq_norm_pass_min_postDD_sup003, var_name)) &
    get_variable(ps_DeSeq_norm_pass_min_postDD_sup003, var_name)!='NA' &
    get_variable(ps_DeSeq_norm_pass_min_postDD_sup003, var_name)!=''
  tmp_ps <- prune_samples(keepid, ps_DeSeq_norm_pass_min_postDD_sup003)

  var_type <- permanova_var$permanova[j]
  # test only when more than one distinct value remains; the ifnumeric flag
  # simply mirrors the Categorical/Continuous annotation
  if(uniqueN(sample_data(tmp_ps)[, var_name])>1 && var_type %in% c('Categorical', 'Continuous')){
    p <- permanova(tmp_ps, factorName=var_name, ifnumeric=(var_type=='Continuous'), pmt=999)
    permanova_res=rbindlist(list(permanova_res, p))
    rm(p)
  }
  rm(var_name)
}

# write the full (unfiltered) PERMANOVA results
write.csv(permanova_res, file=paste0(output_data, 'PERMANOVA.csv'), row.names=FALSE)

# total number of variables tested
uniqueN(permanova_res$Variable)

[1] 123

# breakdown by factor class
pander(table(permanova_res$FactorClass))
categorical numeric
91 32
# number of variables significant at permanova_pcut
uniqueN(permanova_res[pvalue<permanova_pcut]$Variable)

[1] 113

# remove variables whose betadispersion test was significant (heterogeneous
# dispersion makes the PERMANOVA result unreliable)
impacting_compo<-setdiff(permanova_res[pvalue<permanova_pcut]$Variable,  to_remove_betadis)

# keep only variables also significantly different between the two cohorts
# (chi-squared results computed earlier)
impacting_compo<-impacting_compo[impacting_compo %in% c(names(all_chisquare))]
permanova_res<- permanova_res[permanova_res$Variable %in% impacting_compo,]

# remove LR predictions: they are essentially an indicator of phenotype and
# not confounding variables
permanova_res<-permanova_res[-which(permanova_res$Variable %in% c("LR10.probability.ASD..M3.", "LR5.probability.ASD..M3.", "LR6.probability.ASD..M3." , "LR10.prediction..M3.", "LR10.probability.not.ASD..M3.", "LR5.probability.not.ASD..M3." , "LR5.prediction..M3.", "LR6.prediction..M3." , "LR6.probability.not.ASD..M3.")),]

# sort by decreasing effect size (R2)
permanova_res <- permanova_res[order(R2, decreasing=TRUE)]
datatable(permanova_res)
write.csv(permanova_res, file=paste0(output_data, 'PERMANOV_betadis_imp_corhort.csv'), row.names=FALSE)

7.2 Visualize permanova significant variables on PCoA

# Select variables to visualize on the PCoA: PERMANOVA-significant variables
# explaining more than 1% of the variance (R2 > 0.01)
imp_factors<-permanova_res$Variable[permanova_res$R2 > 0.01]

# Drop variables with any NA in the metadata: the CAP ordination formula
# cannot handle missing values.
# FIX: replaced an index-growing loop (1:length(...) breaks on an empty
# vector and left NULL gaps in the list) with a vectorized filter that
# produces the same impforpcoa vector.
has_na <- vapply(imp_factors, function(v) anyNA(map[, v]), logical(1))
impforpcoa <- imp_factors[!has_na]

# right-hand side of the ordination formula: all retained variables
add<-paste(impforpcoa, collapse = " + ")

#copy and paste form variable below into formula for pcoa for convenience (not sure why it does not work as an input for formula, but copy/paste as text works)
form<- as.formula(paste0("~ ", add))


#ordination formula only working with one variable in formula...
# CAP ordination constrained by the NA-free high-R2 variables.
# NOTE(review): the formula is hard-coded; it must be kept in sync with
# impforpcoa -- the plotting loop below assumes title_prep[1:4] correspond
# to these four terms. Confirm whenever impforpcoa changes.
ps_pcoa <- ordinate(
  physeq = ps_DeSeq_norm_pass_min_postDD_sup003, 
  method = "CAP", 
  distance = "bray",
  #Did not include Toilet.cover and Meat/Seafood Longitudinal, Fruit..consumption.frequency...longitudinal. and LR10.prediction..M3. due to NA missing values which does not allow for ordination
  formula = ~Stool.frequency + Age..months. + Age..years. + Vitamin.D..consumption.frequency.)
title_prep<-impforpcoa

# one ordination plot per constraining variable, colored by that variable
to_plot=list()
for (i in 1:4){
  to_plot[[i]] <- plot_ordination(
  physeq = ps_DeSeq_norm_pass_min_postDD_sup003, 
  ordination = ps_pcoa, 
  color = title_prep[i], 
  axes = c(1,2),
  title=title_prep[i]
) + 
  geom_point( size = 0.5) +
  theme(text = element_text(size =20), plot.title = element_text(size=15))
}
# fifth panel: taxa scores on the same ordination
to_plot[[5]]<-plot_ordination(physeq = ps_DeSeq_norm_pass_min_postDD_sup003, ordination = ps_pcoa, type="taxa",title ="Taxa") + theme(text = element_text(size =15))
# stack the five panels vertically
lay <- rbind(c(1),
             c(2),
             c(3),
             c(4),
             c(5))


#pdf(paste0(output_data,"confounding_factors.pdf",width=16,height=40))
grid.arrange(grobs = to_plot, layout_matrix = lay)

top_potential_confounds <- imp_factors   # keep the full high-R2 list for later use
#dev.off()
# look at the taxa plot on its own, with smaller text
plot_ordination(physeq = ps_DeSeq_norm_pass_min_postDD_sup003, ordination = ps_pcoa, type="taxa",title ="Taxa") + theme(text = element_text(size =8))

# find the species (taxa) that carry the most weight in this ordination
taxa.to.select<-vegan::scores(ps_pcoa)$species
# plot without row names for visibility
rownames(taxa.to.select)<-c()
s.arrow(taxa.to.select) # the taxa that influence the plots the most are above 0.25

# taxa with an absolute score > 0.1 on either of the first two axes
taxa.to.select.to.rem<-vegan::scores(ps_pcoa)$species[abs(vegan::scores(ps_pcoa)$species[,1])>0.1 | abs(vegan::scores(ps_pcoa)$species[,2])>0.1,]
# any overlap with the DESeq time-series significant taxa?
# NOTE(review): this compares rownames against the matrix VALUES, not its
# rownames; it likely should be `%in% rownames(taxa.to.select.to.rem)` --
# as written the comparison is character-vs-numeric and essentially always FALSE
rownames(bla[[2]]) %in% taxa.to.select.to.rem #NOPE!
## [1] FALSE FALSE

7.3 Comparison of Variance between Subjects and between Phenotypes

#Comparing variance w/ avg difference in distance
# For each family: the mean Bray-Curtis distance between the three timepoints
# of the A (autistic) child and, separately, of the N (neurotypical) sibling,
# giving one within-subject variability value per phenotype per family.
#
# FIX: the table is seeded with family "1", but the loop previously iterated
# over ALL unique families, so family 1 was added twice; the loop now skips
# family "1" (matching the [c(-1)] intent of the paired version below).
tmpps <- prune_samples((ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Family.group.ID == "1"), ps_DeSeq_norm_pass_min_postDD_sup003)
tmppsA <- prune_samples(tmpps@sam_data$phenotype == "A", tmpps)
tmppsN <- prune_samples(tmpps@sam_data$phenotype == "N", tmpps)

# ave() returns the mean repeated; [1] extracts it (mean of the 3 distances)
A<-distance(tmppsA, "bray", type = "samples")
ave_distanceA=ave(c(A[1],A[2],A[3]))[1]

N<-distance(tmppsN, "bray", type = "samples")
ave_distanceN=ave(c(N[1],N[2],N[3]))[1]

tab<-tibble(ave_distanceA, ave_distanceN, tmpps@sam_data$Family.group.ID[1])
colnames(tab) <- c("AvgDistanceAut", "AvgDistanceNeu", "Family")


# remaining families (family "1" already seeded above)
for(i in setdiff(unique(ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Family.group.ID), "1")){
tmpps <- prune_samples((ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Family.group.ID == i), ps_DeSeq_norm_pass_min_postDD_sup003)
tmppsA <- prune_samples(tmpps@sam_data$phenotype == "A", tmpps)
tmppsN <- prune_samples(tmpps@sam_data$phenotype == "N", tmpps)

A<-distance(tmppsA, "bray", type = "samples")
ave_distanceA=ave(c(A[1],A[2],A[3]))[1]

N<-distance(tmppsN, "bray", type = "samples")
ave_distanceN=ave(c(N[1],N[2],N[3]))[1]

tabtmp<-tibble(ave_distanceA, ave_distanceN, tmpps@sam_data$Family.group.ID[1])
colnames(tabtmp) <- c("AvgDistanceAut", "AvgDistanceNeu", "Family")

tab<-rbind(tab, tabtmp)
}


# Reshape to long format: stack the A and N averages into one column with a
# phenotype label so the two groups can be plotted and tested side by side
taba<-tibble(tab$AvgDistanceAut, rep("A", length(tab$AvgDistanceAut)), tab$Family)
colnames(taba) <- c("AvgDistanceDiff_btwnTimepoints","phenotype", "Family" )

tabn<-tibble(tab$AvgDistanceNeu, rep("N", length(tab$AvgDistanceNeu)), tab$Family)
colnames(tabn) <- c("AvgDistanceDiff_btwnTimepoints","phenotype" , "Family")

finaltab2<-rbind(tabn, taba)

# boxplot + jittered points of within-subject variability by phenotype
p <- ggplot(finaltab2, aes(x=phenotype, y=AvgDistanceDiff_btwnTimepoints)) + 
  geom_boxplot()
p + geom_jitter(shape=16, position=position_jitter(0.2))

# run tests to check significance
shapiro.test(finaltab2$AvgDistanceDiff_btwnTimepoints) # not normal, so a rank-based test is needed
## 
##  Shapiro-Wilk normality test
## 
## data:  finaltab2$AvgDistanceDiff_btwnTimepoints
## W = 0.87065, p-value = 3.487e-09
# unpaired Wilcoxon rank-sum test of within-subject variability, A vs N
# NOTE(review): var.equal is not a wilcox.test argument and is silently ignored
wilcox.test(AvgDistanceDiff_btwnTimepoints ~ phenotype, data=finaltab2, var.equal=FALSE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  AvgDistanceDiff_btwnTimepoints by phenotype
## W = 1948, p-value = 0.6354
## alternative hypothesis: true location shift is not equal to 0
#not significant


#paired wilcoxon
# Build per-family tables of the three pairwise Bray-Curtis distances between
# timepoints (1-2, 1-3, 2-3), separately for the A and N child, keeping the
# timepoint label so A and N values can later be paired.
tmpps <- prune_samples((ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Family.group.ID == "1"), ps_DeSeq_norm_pass_min_postDD_sup003)
tmppsA <- prune_samples(tmpps@sam_data$phenotype == "A", tmpps)
tmppsN <- prune_samples(tmpps@sam_data$phenotype == "N", tmpps)

A<-distance(tmppsA, "bray", type = "samples")
distanceA=c(A[1],A[2],A[3])

N<-distance(tmppsN, "bray", type = "samples")
distanceN=c(N[1],N[2],N[3])

tab<-tibble(distanceA, distanceN, rep(tmpps@sam_data$Family.group.ID[1], length(distanceA)), c("Timepoint 1-2", "Timepoint 1-3", "Timepoint 2-3"))
colnames(tab) <- c("DistanceAut", "DistanceNeu", "Family", "Timepoint")


# NOTE(review): [c(-1)] assumes family "1" is the FIRST element returned by
# unique(); if sample ordering ever changes, family 1 would be duplicated or
# another family dropped -- confirm, or use setdiff(unique(...), "1")
for(i in unique(ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Family.group.ID)[c(-1)]){
tmpps <- prune_samples((ps_DeSeq_norm_pass_min_postDD_sup003@sam_data$Family.group.ID == i), ps_DeSeq_norm_pass_min_postDD_sup003)
tmppsA <- prune_samples(tmpps@sam_data$phenotype == "A", tmpps)
tmppsN <- prune_samples(tmpps@sam_data$phenotype == "N", tmpps)

A<-distance(tmppsA, "bray", type = "samples")
distanceA=c(A[1],A[2],A[3])

N<-distance(tmppsN, "bray", type = "samples")
distanceN=c(N[1],N[2],N[3])

tabtmp<-tibble(distanceA, distanceN, rep(tmpps@sam_data$Family.group.ID[1], length(distanceA)),  c("Timepoint 1-2", "Timepoint 1-3", "Timepoint 2-3"))
colnames(tabtmp) <- c("DistanceAut", "DistanceNeu", "Family", "Timepoint")

tab<-rbind(tab, tabtmp)
}
tab_w_tp <- tab   # keep a copy with timepoint labels for the per-timepoint tests below

# Reshape to long format: stack the A and N distances with a phenotype label
taba<-tibble(tab$DistanceAut, rep("A", length(tab$DistanceAut)), tab$Family, tab$Timepoint)
colnames(taba) <- c("Distances_btwnTimepoints","phenotype", "Family" , "Timepoint")

tabn<-tibble(tab$DistanceNeu, rep("N", length(tab$DistanceNeu)), tab$Family, tab$Timepoint)
colnames(tabn) <- c("Distances_btwnTimepoints","phenotype", "Family" , "Timepoint")

finaltab3<-rbind(tabn, taba)

# boxplot + jittered points of the raw between-timepoint distances by phenotype
p <- ggplot(finaltab3, aes(x=phenotype, y=Distances_btwnTimepoints)) + 
  geom_boxplot()
p + geom_jitter(shape=16, position=position_jitter(0.2))

# Paired Wilcoxon signed-rank test: A vs N distances, paired within family
# and timepoint (rows of tab are aligned by construction above)
wilcox.test(tab$DistanceAut, tab$DistanceNeu, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  tab$DistanceAut and tab$DistanceNeu
## V = 9608, p-value = 0.4028
## alternative hypothesis: true location shift is not equal to 0
#still not significant




#Will do permutations


# Permutation test on the per-family AVERAGE distances: shuffle phenotype
# labels across all rows and recompute the A-vs-N difference in means
permutation_meandist_gen<-function(x){
  ptab <- x
  ptab$permu_label<- ptab$phenotype[shuffle(ptab$phenotype)]

  mean(ptab$AvgDistanceDiff_btwnTimepoints[which(ptab$permu_label == "A")]) -
  mean(ptab$AvgDistanceDiff_btwnTimepoints[which(ptab$permu_label == "N")])

}
# NOTE(review): no set.seed() before this replicate(), unlike the later
# permutation runs (which use set.seed(1)) -- this run is not reproducible
permu_means<-replicate(1000, permutation_meandist_gen(finaltab2))

# observed difference in means (A - N)
diff.means<-mean(finaltab2$AvgDistanceDiff_btwnTimepoints[which(finaltab2$phenotype == "A")]) -
  mean(finaltab2$AvgDistanceDiff_btwnTimepoints[which(finaltab2$phenotype == "N")])

# count of permuted differences exceeding the observed one (strict >)
sig <- sum(permu_means > diff.means)

hist(permu_means) 

# with raw values
# Same permutation test but on the raw between-timepoint distances
# (finaltab3); note this REDEFINES permutation_meandist_gen from above
permutation_meandist_gen<-function(x){
  ptab <- x
  ptab$permu_label<- ptab$phenotype[shuffle(ptab$phenotype)]

  mean(ptab$Distances_btwnTimepoints[which(ptab$permu_label == "A")]) -
  mean(ptab$Distances_btwnTimepoints[which(ptab$permu_label == "N")])

}
set.seed(1)
permu_means<-replicate(1000, permutation_meandist_gen(finaltab3))

# observed difference in means (A - N)
diff.means<-mean(finaltab3$Distances_btwnTimepoints[which(finaltab3$phenotype == "A")]) -
  mean(finaltab3$Distances_btwnTimepoints[which(finaltab3$phenotype == "N")])

# one-sided permutation p-value: fraction of permuted differences >= observed
sig <- as.numeric(sum(permu_means >= diff.means))
pval<-sig/1000

pval
## [1] 0.414
{hist(permu_means)
 abline(v = diff.means, col = "red")}

{plot(density(permu_means))
  abline(v = diff.means, col = "red")}

#organized by family

# Restricted permutation: shuffle phenotype labels only WITHIN each family,
# preserving the paired family structure of the design
permutation_meandist_gen_by_fam<-function(x){
  ptab <- x
  # the first family seeds the permuted table
  tmp <- ptab[which(ptab$Family == ptab$Family[1]),]
  tmp$permu_label<- tmp$phenotype[shuffle(tmp$phenotype)]
  ptab_all <- tmp
  
  # remaining families, each shuffled independently
  for (i in 2:length(unique(ptab$Family))){
  tmp <- ptab[which(ptab$Family == unique(ptab$Family)[i]),]
  tmp$permu_label<- tmp$phenotype[shuffle(tmp$phenotype)]
  ptab_all <- rbind(ptab_all, tmp)
  }
  

  # difference in group means under the permuted labels (A - N)
  mean(ptab_all$Distances_btwnTimepoints[which(ptab_all$permu_label == "A")]) -
  mean(ptab_all$Distances_btwnTimepoints[which(ptab_all$permu_label == "N")])

}
set.seed(1)
permu_means_by_fam<-replicate(1000, permutation_meandist_gen_by_fam(finaltab3))


# FIX: the significance count previously reused `permu_means` (the null
# distribution of the earlier, unstratified permutation), so pval_by_fam
# silently duplicated that p-value; it must use permu_means_by_fam.
# diff.means (the observed A - N difference on finaltab3) is unchanged.
sig <- as.numeric(sum(permu_means_by_fam >= diff.means))
pval_by_fam<-sig/1000

pval_by_fam
{hist(permu_means_by_fam)
 abline(v = diff.means, col = "red")}

{plot(density(permu_means_by_fam))
  abline(v = diff.means, col = "red")}

# Now with difference between A and N at each time point, then taking mean


#Since they are in order by family and timepoint, I can subtract across

#view to make sure
#finaltab3[which(finaltab3$phenotype == "A"),]
#finaltab3[which(finaltab3$phenotype == "N"),]



# First just by timepoint
# Within-family shuffle, then a PAIRED statistic: the mean of elementwise
# (A - N) differences under the permuted labels.
# NOTE(review): the elementwise subtraction assumes the A-labelled and
# N-labelled rows come out in matching family/timepoint order after the
# shuffle -- confirm this pairing holds for the input tables used
permutation_meandist_gen_by_fam_diff<-function(x){
  ptab <- x
  tmp <- ptab[which(ptab$Family == ptab$Family[1]),]
  tmp$permu_label<- tmp$phenotype[shuffle(tmp$phenotype)]
  ptab_all <- tmp
  
  # remaining families, each shuffled independently
  for (i in 2:length(unique(ptab$Family))){
  tmp <- ptab[which(ptab$Family == unique(ptab$Family)[i]),]
  tmp$permu_label<- tmp$phenotype[shuffle(tmp$phenotype)]
  ptab_all <- rbind(ptab_all, tmp)
  }
  

  # paired statistic under the permuted labels
  mean(ptab_all$Distances_btwnTimepoints[which(ptab_all$permu_label == "A")]-
  ptab_all$Distances_btwnTimepoints[which(ptab_all$permu_label == "N")])

}

# Split the long table by timepoint pair and run the within-family paired
# permutation test separately for each
fintab_1_2<-finaltab3[which(finaltab3$Timepoint == "Timepoint 1-2"),]
fintab_1_3<-finaltab3[which(finaltab3$Timepoint == "Timepoint 1-3"),]
fintab_2_3<-finaltab3[which(finaltab3$Timepoint == "Timepoint 2-3"),]

set.seed(1)
permu_means_by_fam12<-replicate(1000, permutation_meandist_gen_by_fam_diff(fintab_1_2))
# observed paired mean difference (A - N) for timepoints 1-2
mean_difference<-mean(fintab_1_2$Distances_btwnTimepoints[which(fintab_1_2$phenotype == "A")] -fintab_1_2$Distances_btwnTimepoints[which(fintab_1_2$phenotype == "N")])
sig <- as.numeric(sum(permu_means_by_fam12 >= mean_difference))
pval_by_fam12<-sig/1000

set.seed(1)
permu_means_by_fam13<-replicate(1000, permutation_meandist_gen_by_fam_diff(fintab_1_3))
# observed paired mean difference (A - N) for timepoints 1-3
mean_difference<-mean(fintab_1_3$Distances_btwnTimepoints[which(fintab_1_3$phenotype == "A")] -fintab_1_3$Distances_btwnTimepoints[which(fintab_1_3$phenotype == "N")])
sig <- as.numeric(sum(permu_means_by_fam13 >= mean_difference))
pval_by_fam13<-sig/1000


set.seed(1)
permu_means_by_fam23<-replicate(1000, permutation_meandist_gen_by_fam_diff(fintab_2_3))
# observed paired mean difference (A - N) for timepoints 2-3
mean_difference<-mean(fintab_2_3$Distances_btwnTimepoints[which(fintab_2_3$phenotype == "A")] -fintab_2_3$Distances_btwnTimepoints[which(fintab_2_3$phenotype == "N")])
sig <- as.numeric(sum(permu_means_by_fam23 >= mean_difference))
pval_by_fam23<-sig/1000


#All_together
#
# One permutation draw where labels are shuffled within family AND within
# timepoint pair, so the timepoint structure is preserved under the null.
# (Fixed: the timepoint count was hard-coded as `for (b in 2:3)` and the
# family loop used the unsafe `2:length(...)` pattern; both loops also
# duplicated their first iteration outside the loop.  Now generalized to
# however many timepoint pairs each family actually has.)
permutation_meandist_gen_by_fam_diff_all <- function(x){
  ptab <- x
  shuffled_per_family <- lapply(unique(ptab$Family), function(fam){
    tmp <- ptab[which(ptab$Family == fam), ]
    shuffled_per_tp <- lapply(unique(tmp$Timepoint), function(tp){
      tmp_time <- tmp[which(tmp$Timepoint == tp), ]
      # shuffle() (permute, via vegan): random permutation of indices
      tmp_time$permu_label <- tmp_time$phenotype[shuffle(tmp_time$phenotype)]
      tmp_time
    })
    do.call(rbind, shuffled_per_tp)
  })
  ptab_all <- do.call(rbind, shuffled_per_family)

  mean(ptab_all$Distances_btwnTimepoints[which(ptab_all$permu_label == "A")] -
       ptab_all$Distances_btwnTimepoints[which(ptab_all$permu_label == "N")])
}

# Overall permutation test across all timepoints, shuffling within family
# AND timepoint as the variable name states.
# (Fixed: this previously called the plain by-family shuffler, so the
# `_all` function defined above was never used and the timepoint structure
# was NOT maintained; also seeded the RNG for reproducibility, matching
# the per-timepoint tests above.)
set.seed(1)
permu_means_by_fam_shuffled_while_maintaining_tp<-replicate(1000, permutation_meandist_gen_by_fam_diff_all(finaltab3))
mean_difference<-mean(finaltab3$Distances_btwnTimepoints[which(finaltab3$phenotype == "A")] -finaltab3$Distances_btwnTimepoints[which(finaltab3$phenotype == "N")])
sig <- as.numeric(sum(permu_means_by_fam_shuffled_while_maintaining_tp >= mean_difference))
pval_by_fam_all<-sig/1000

# Null distribution with the observed difference marked
{plot(density(permu_means_by_fam_shuffled_while_maintaining_tp))
  abline(v = mean_difference, col = "red")}

7.4 Without NA points

# Plot the first two PCoA axes for one metadata variable, excluding samples
# whose value is missing (real NA, the literal string 'NA', or empty).
#   ps:      phyloseq object
#   pvar:    metadata column name to colour points by
#   ord_res: ordination result (expects $vectors and $values$Eigenvalues)
wo_na_pcoa <- function(ps, pvar, ord_res){
  # Keep only samples with a usable value for pvar
  vals <- get_variable(ps, pvar)
  usable <- !is.na(vals) & vals != 'NA' & vals != ''
  sub_ps <- prune_samples(usable, ps)

  # First two ordination axes for the retained samples, plus the variable
  DF <- cbind(ord_res$vectors[row.names(sample_data(sub_ps)), 1:2],
              sample_data(sub_ps)[, pvar])
  setnames(DF, pvar, 'testvar')

  # Percent variance explained by the first two axes
  eig <- (ord_res$values$Eigenvalues / sum(ord_res$values$Eigenvalues))[1:2] * 100

  p <- ggplot(data = DF, aes(x = Axis.1, y = Axis.2, colour = testvar)) +
    geom_point(size = 2) +
    ggtitle(pvar) +
    xlab(paste0('Axis.1 [', format(eig[1], digits = 3), '%]')) +
    ylab(paste0('Axis.2 [', format(eig[2], digits = 3), '%]')) +
    theme_minimal() +
    theme(legend.title = element_blank(), legend.position = "bottom")

  print(p)
}

7.4.1 Looking into confounding variables

#Hard to find a confounding variable in impfactors that does not have a lot of NAs (no NAs required for DESEQ2) will put through metagenomeseq

#Function Updated with Altered formula for confound var
# DESeq2 differential abundance for `confound`, controlling for timepoint.
#   ps:       phyloseq object (already NA-pruned for `confound`)
#   dcut:     adjusted-p cutoff for the significant table
#   confound: name of a two-level metadata column to contrast
# Returns list(full results, significant-only data.frame); when nothing is
# significant, the second element is a 1x1 placeholder ("nothing" / padj=1)
# so downstream rowname access still works.
# (Fixed: the contrast levels were read from the GLOBAL `map`, silently
# coupling this function to script state; they now come from `ps` itself.)
runDESeq_time_confound <- function(ps, dcut, confound){
  diagdds = phyloseq_to_deseq2(ps, as.formula(paste0("~ ", confound, "+ Within.study.sampling.date")))
  diagdds <- estimateSizeFactors(diagdds, type = "poscounts")
  diagdds <- DESeq(diagdds, fitType = "parametric", betaPrior = FALSE)
  #resultsNames(diagdds): to determine the contrast
  cvals <- get_variable(ps, confound)
  if (!is.factor(cvals)) cvals <- factor(cvals)
  lv <- levels(cvals)
  res = results(diagdds, contrast = c(confound, lv[1], lv[2]))
  res$padj[is.na(res$padj)] = 1
  sig <- res[res$padj < dcut, ]
  if (nrow(sig) == 0) {
    # placeholder keeps the return shape stable when nothing passes dcut
    sigtab <- as.data.frame(1, row.names = "nothing")
    colnames(sigtab) <- 'padj'
  } else {
    sigtab <- data.frame(sig)
  }
  return(list(res, sigtab))
}

#Function Updated with Altered formula for confound var
# metagenomeSeq (fitZig) differential abundance for `confound`,
# controlling for timepoint.
#   ps: phyloseq object; maxit: EM iterations; mcut: adjPvalues cutoff;
#   confound: metadata column of interest.
# Returns list(unfiltered MRtable, filtered significant table).
# (Fixed the author-flagged bug: the effective-sample minimums were subset
# with %in% BEFORE the NA-named rows were dropped, so the vector could be
# both misaligned (order of `names(eff)`, not of `res_fit`) and the wrong
# length when assigned; it is now matched by rowname AFTER that removal.)
run_metagenom_seq_confound<-function(ps, maxit, mcut, confound){
  p_metag<-phyloseq_to_metagenomeSeq(ps)
  #filtering at least 4 samples
  p_metag= cumNorm(p_metag, p=0.75)
  normFactor =normFactors(p_metag)
  normFactor =log2(normFactor/median(normFactor) + 1)
  mod = model.matrix(as.formula(paste0("~ ", confound, "+ Within.study.sampling.date +normFactor")), data = pData(p_metag))
  settings =zigControl(maxit =maxit, verbose =FALSE)
  fit =fitZig(obj = p_metag, mod = mod, useCSSoffset = FALSE, control = settings)
  #Note: changed fit$taxa to fit@taxa in light of error (probably from newer metagenomeseq ver.)
  res_fit<-MRtable(fit, number = length(fit@taxa))
  res_fit_nonfiltered <- copy(res_fit)
  res_fit<-res_fit[res_fit$adjPvalues<mcut,]
  #manually removing the ones with "NA" in the rowname FIRST, so the
  #effective-sample vector below lines up with the remaining rows
  res_fit<-res_fit[grep("NA", rownames(res_fit), invert = TRUE),]
  eff_samples<-calculateEffectiveSamples(fit)
  Min_effec_samp<-eff_samples[match(rownames(res_fit), names(eff_samples))]
  res_fit$Min_sample<-Min_effec_samp
  #finally remove the ones that are not with enough samples
  res_fit<-res_fit[res_fit$`+samples in group 0` >= Min_effec_samp & res_fit$`+samples in group 1` >= Min_effec_samp,]
  return(list(res_fit_nonfiltered, res_fit))
}

cat_confounds<-permanova_res$Variable[permanova_res$FactorClass == "categorical"]
num_confounds<-permanova_res$Variable[permanova_res$FactorClass == "numeric"]

#Remove var with more than 40 NAs
# (Fixed: the old list + `x[-which(...)]` pattern returned an EMPTY vector
# whenever NO variable exceeded the cutoff, because x[-integer(0)] selects
# nothing; counting NAs per variable and keeping the rest is safe.)
na_per_var <- vapply(cat_confounds,
                     function(v) sum(is.na(filtered_ps003@sam_data[[v]])),
                     numeric(1))
cat_confounds <- cat_confounds[na_per_var < 40]

# First categorical confound, handled once to initialise `affected_taxa`
# (the loop below appends the remaining confounds).
confound <- cat_confounds[1]
#some are listed as logical, so round-trip through csv before factoring
write.csv(sample_data(filtered_ps003), "sam_data.csv")
map<-read.csv("sam_data.csv")
map[,confound] <- as.factor(map[,confound])
rownames(map) <- map$Biospecimen.Barcode
sample_data(filtered_ps003) <- map
filtered_ps003NAout<-prune_samples(!is.na(map[,confound]), filtered_ps003)
# DESeq2 contrast only makes sense for two-level factors; metagenomeSeq
# runs either way.
# (Fixed: the condition compared the LEVELS THEMSELVES to 2 instead of the
# number of levels — cf. the identical, correct check in the loop below.)
if(length(levels(map[,confound])) == 2){
deseqcon<-runDESeq_time_confound(filtered_ps003NAout, deseq_cut, confound = confound)
mtgcon<-run_metagenom_seq_confound(filtered_ps003NAout, 30, mtgseq_cut, confound = confound)
affected_taxa <-c(rownames(mtgcon[[2]]), row.names(deseqcon[2][[1]]) )
} else{
  mtgcon<-run_metagenom_seq_confound(filtered_ps003NAout, 30, mtgseq_cut, confound = confound)
  affected_taxa <-rownames(mtgcon[[2]])
}


#Run Deseq2 or Metagenomeseq on categorical confounds
# (first confound handled above; this loop covers the rest)

for (idx in 2:length(cat_confounds)) {
  confound <- cat_confounds[idx]
  # some are listed as logical — round-trip through csv before factoring
  write.csv(sample_data(filtered_ps003), "sam_data.csv")
  map <- read.csv("sam_data.csv")
  map[, confound] <- as.factor(map[, confound])
  rownames(map) <- map$Biospecimen.Barcode
  sample_data(filtered_ps003) <- map
  filtered_ps003NAout <- prune_samples(!is.na(map[, confound]), filtered_ps003)
  # DESeq2 contrast only for two-level factors; metagenomeSeq either way
  if (length(levels(map[, confound])) == 2) {
    deseqcon <- runDESeq_time_confound(filtered_ps003NAout, deseq_cut, confound = confound)
    mtgcon <- run_metagenom_seq_confound(filtered_ps003NAout, 30, mtgseq_cut, confound = confound)
    hits <- c(rownames(mtgcon[[2]]), row.names(deseqcon[2][[1]]))
  } else {
    mtgcon <- run_metagenom_seq_confound(filtered_ps003NAout, 30, mtgseq_cut, confound = confound)
    hits <- rownames(mtgcon[[2]])
  }
  affected_taxa <- c(affected_taxa, hits)
}

affected_taxa_cat <- unique(affected_taxa)




#Testing to see if numerical variables differ in means using wilcox or t-tests
#
# For each numeric confound, a Shapiro-Wilk test chooses the comparison:
# non-normal (p <= 0.05) gets a Wilcoxon rank-sum test, otherwise Welch's
# t-test; results accumulate in `numerical_test_btwn_pheno`.
# (Rewritten: the first variable was previously handled by a duplicated
# copy of the loop body; also dropped the `var.equal` argument that was
# being passed to wilcox.test, which has no such parameter.)
numerical_test_btwn_pheno <- NULL
for (nvar in num_confounds) {
  normality_p <- shapiro.test(map[, nvar])$p.value
  if (normality_p <= 0.05) {
    res <- wilcox.test(as.formula(paste(nvar, "~ phenotype")), data = map)
    test_type <- "wilcox"
  } else {
    res <- t.test(as.formula(paste(nvar, "~ phenotype")), data = map)
    test_type <- "t.test"
  }
  tab <- tibble(nvar, res$p.value, test_type)
  colnames(tab) <- c("Var", "p.value", "Type")
  # rbind(NULL, tab) is just tab, so no special-casing of the first row
  numerical_test_btwn_pheno <- rbind(numerical_test_btwn_pheno, tab)
}


# Multiple-testing correction (p.adjust default = Holm), then keep only the
# numeric variables still significant between phenotypes
numerical_test_btwn_pheno$p.value <- p.adjust(numerical_test_btwn_pheno$p.value)
sig_numvar <- subset(numerical_test_btwn_pheno, p.value <= 0.05)

num_confounds2 <- sig_numvar$Var






#Spearman test between taxa and possible confounding variables that are ordinal
#
# For every significant numeric confound, Spearman-correlate each ASV's
# normalized abundance with the confound and collect the ASVs whose
# p.adjust()-corrected (default Holm) p-value is < 0.05.
# (Rewritten: the first confound AND the first ASV were each previously
# handled by duplicated copies of this loop body; one nested loop now
# covers everything.)
# NOTE(review): assumes the rows of the transposed otu_table align with the
# rows of `map` — confirm the sample ordering upstream.
otus <- as.data.frame(t(as.data.frame(otu_table(ps_DeSeq_norm_pass_min_postDD_sup003))))
asv_names <- colnames(otus)

spear_sig_asvs_p.val <- character(0)
for (confound in num_confounds2) {
  otus$confound <- map[, confound]
  pvals <- vapply(asv_names, function(asv) {
    # formula interface so incomplete pairs are dropped, as before
    form <- as.formula(paste("~", asv, "+", "confound"))
    cor.test(formula = form, data = otus, method = "spearman",
             continuity = FALSE, conf.level = 0.95, exact = FALSE)$p.value
  }, numeric(1))
  spear_p.val <- p.adjust(pvals)
  spear_sig_asvs_p.val <- c(spear_sig_asvs_p.val, asv_names[spear_p.val < 0.05])
}

#generate full list of significant ones with confounds
affected_num_list <- unique(spear_sig_asvs_p.val)

full_confound_asv_list <- unique(c(affected_num_list, affected_taxa_cat))


#Filter them out from main list and save
# (Fixed: `[-which(...)]` EMPTIES the table when nothing matches, because
# x[-integer(0)] selects zero rows; logical negation is safe either way.)
full_sigtab_esv_confoundfiltered <- fullsigtab_esv[!(rownames(fullsigtab_esv) %in% full_confound_asv_list), ]

write.csv(full_sigtab_esv_confoundfiltered, "Full_sig_asvs_w_confounding_var_asvs_filtered_out.csv")

confounding_var_list <- c(num_confounds2, cat_confounds)
saveRDS(confounding_var_list, "confounding_var_list.rds")


# Copy for display: drop rownames (ASV sequences) and the ASV column
full_sigtab_esv_confoundfiltered.print <- full_sigtab_esv_confoundfiltered
rownames(full_sigtab_esv_confoundfiltered.print) <- NULL
full_sigtab_esv_confoundfiltered.print$ASV <- NULL

full_sigtab_esv_confoundfiltered.print
##      Method+Data      Domain              Phylum             Class
## 1         Mtg_P1 d__Bacteria p__Actinobacteriota c__Actinobacteria
## 2         Mtg_P1 d__Bacteria       p__Firmicutes        c__Bacilli
## 3         Mtg_P2 d__Bacteria       p__Firmicutes        c__Bacilli
## 4         Mtg_P2 d__Bacteria     p__unclassified   c__unclassified
## 5         Mtg_P3 d__Bacteria     p__Firmicutes_A     c__Clostridia
## 6         Mtg_P3 d__Bacteria     p__Firmicutes_A     c__Clostridia
## 7         Mtg_P3 d__Bacteria       p__Firmicutes        c__Bacilli
## 8         Mtg_P3 d__Bacteria     p__Firmicutes_A     c__Clostridia
## 9         Mtg_P3 d__Bacteria     p__Firmicutes_A     c__Clostridia
## 10       Mtg_All d__Bacteria     p__Firmicutes_A     c__Clostridia
## 11       Mtg_All d__Bacteria     p__Firmicutes_A     c__Clostridia
## 12       Mtg_All d__Bacteria     p__Bacteroidota    c__Bacteroidia
## 13 Mtg_P1Mtg_All d__Bacteria     p__Firmicutes_A     c__Clostridia
## 14 Mtg_P1Mtg_All d__Bacteria     p__Firmicutes_A     c__Clostridia
## 15 Mtg_P3Mtg_All d__Bacteria     p__Firmicutes_A     c__Clostridia
## 16 Mtg_P2Mtg_All d__Bacteria     p__Firmicutes_A     c__Clostridia
## 17 Mtg_P3Mtg_All d__Bacteria     p__Firmicutes_C  c__Negativicutes
## 18 Mtg_P3Mtg_All d__Bacteria     p__Bacteroidota    c__Bacteroidia
## 19     DESeq2_P1 d__Bacteria     p__Firmicutes_A     c__Clostridia
## 20     DESeq2_P1 d__Bacteria     p__Firmicutes_A     c__Clostridia
## 21     DESeq2_P1 d__Bacteria     p__Firmicutes_C  c__Negativicutes
## 22     DESeq2_P3 d__Bacteria     p__Bacteroidota    c__Bacteroidia
## 23     DESeq2_P3 d__Bacteria       p__Firmicutes        c__Bacilli
## 24  DESeq2_P1+P3 d__Bacteria     p__Bacteroidota    c__Bacteroidia
##                    Order                       Family                     Genus
## 1     o__Actinomycetales        f__Bifidobacteriaceae        g__Bifidobacterium
## 2     o__Lactobacillales          f__Streptococcaceae          g__Streptococcus
## 3  o__Erysipelotrichales       f__Erysipelotrichaceae               g__Absiella
## 4        o__unclassified              f__unclassified           g__unclassified
## 5      o__Lachnospirales           f__Lachnospiraceae          g__Coprococcus_A
## 6      o__Lachnospirales           f__Lachnospiraceae           g__unclassified
## 7  o__Erysipelotrichales f__Erysipelatoclostridiaceae g__Erysipelatoclostridium
## 8      o__Lachnospirales           f__Lachnospiraceae         g__PROV_t__256727
## 9     o__Oscillospirales           f__Ruminococcaceae           g__unclassified
## 10    o__Oscillospirales        f__Acutalibacteraceae           g__unclassified
## 11       o__unclassified              f__unclassified           g__unclassified
## 12      o__Bacteroidales            f__Bacteroidaceae            g__Bacteroides
## 13     o__Lachnospirales           f__Lachnospiraceae              g__Blautia_A
## 14     o__Lachnospirales           f__Lachnospiraceae         g__PROV_t__182732
## 15     o__Lachnospirales           f__Lachnospiraceae           g__unclassified
## 16     o__Lachnospirales           f__Lachnospiraceae          g__Eubacterium_E
## 17     o__Veillonellales           f__Veillonellaceae            g__Veillonella
## 18      o__Bacteroidales            f__Marinifilaceae            g__Odoribacter
## 19    o__Oscillospirales           f__Ruminococcaceae       g__Faecalibacterium
## 20       o__unclassified              f__unclassified           g__unclassified
## 21     o__Veillonellales            f__Dialisteraceae              g__Dialister
## 22      o__Bacteroidales            f__Bacteroidaceae            g__Bacteroides
## 23    o__Lactobacillales          f__Streptococcaceae            g__Lactococcus
## 24      o__Bacteroidales           f__Barnesiellaceae            g__Barnesiella
##                             Species          Strain
## 1       s__Bifidobacterium__bifidum t__unclassified
## 2                   s__unclassified t__unclassified
## 3                   s__unclassified t__unclassified
## 4                   s__unclassified t__unclassified
## 5           s__Coprococcus_A__catus        t__92557
## 6                   s__unclassified t__unclassified
## 7                   s__unclassified t__unclassified
## 8                 s__PROV_t__256727       t__256727
## 9                   s__unclassified t__unclassified
## 10                  s__unclassified t__unclassified
## 11                  s__unclassified t__unclassified
## 12                  s__unclassified t__unclassified
## 13                  s__unclassified t__unclassified
## 14                  s__unclassified t__unclassified
## 15                  s__unclassified t__unclassified
## 16       s__Eubacterium_E__hallii_A t__unclassified
## 17                  s__unclassified t__unclassified
## 18     s__Odoribacter__splanchnicus t__unclassified
## 19                  s__unclassified t__unclassified
## 20                  s__unclassified t__unclassified
## 21                  s__PROV_t__4989         t__4989
## 22     s__Bacteroides__intestinalis t__unclassified
## 23                  s__unclassified t__unclassified
## 24 s__Barnesiella__intestinihominis        t__21316

7.4.2 Digital phenotype

# MARA score is stored as text in the metadata — coerce to numeric first
sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Mobile.Autism.Risk.Assessment.Score <- as.numeric(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Mobile.Autism.Risk.Assessment.Score)


# Helper: Spearman-correlate every ASV column of `ps` against a numeric
# score vector; return the ASVs whose p.adjust()-corrected (default Holm)
# p-value is < p_cut.
# (Rewritten: the all-samples, CSS, and three per-timepoint chunks were
# five near-identical copies of this code.)
# NOTE(review): assumes `score` is ordered exactly like the samples (the
# rows after transposing otu_table) — confirm upstream ordering.
spearman_sig_vs_score <- function(ps, score, p_cut = 0.05) {
  otus <- as.data.frame(t(as.data.frame(otu_table(ps))))
  otus$score_ <- score
  asvs <- setdiff(colnames(otus), "score_")
  pvals <- vapply(asvs, function(asv) {
    # formula interface so incomplete pairs are dropped, as before
    form <- as.formula(paste("~", asv, "+", "score_"))
    cor.test(formula = form, data = otus, method = "spearman",
             continuity = FALSE, conf.level = 0.95, exact = FALSE)$p.value
  }, numeric(1))
  asvs[p.adjust(pvals) < p_cut]
}


#Spearman test for taxa correlated with MARA with Deseq
ps_DeSeq_norm_pass_min_postDD_sup003_A <- subset_samples(ps_DeSeq_norm_pass_min_postDD_sup003, phenotype == "A")
mapa <- map[which(map$phenotype == "A"),]

spear_sig_asvs_p.val_des_all_MARA <- spearman_sig_vs_score(
  ps_DeSeq_norm_pass_min_postDD_sup003_A,
  mapa[, "Mobile.Autism.Risk.Assessment.Score"])


#Spearman test for taxa correlated with MARA with CSS
ps_CSS_norm_pass_min_postDD_sup003_A <- subset_samples(ps_CSS_norm_pass_min_postDD_sup003, phenotype == "A")

spear_sig_asvs_p.val_des_all_MARA_CSS <- spearman_sig_vs_score(
  ps_CSS_norm_pass_min_postDD_sup003_A,
  mapa[, "Mobile.Autism.Risk.Assessment.Score"])

#Same asvs, but not found in fullsigtab_esv



#Trying by timepoint

#Timepoint 1 (autistic subset only)
P1_des<-prune_samples(rownames(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003_A))[sample_data(ps_DeSeq_norm_pass_min_postDD_sup003_A)$Within.study.sampling.date == "Timepoint 1"], ps_DeSeq_norm_pass_min_postDD_sup003_A)

spear_sig_asvs_p.val_des_1_MARA <- spearman_sig_vs_score(
  P1_des, P1_des@sam_data$Mobile.Autism.Risk.Assessment.Score)


#Timepoint 2
# NOTE(review): unlike Timepoint 1, this subsets the FULL phyloseq object
# (both phenotypes), not the "A"-only subset — confirm this asymmetry is
# intentional; behavior is kept exactly as in the original.
P2_des<-prune_samples(rownames(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003))[sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Within.study.sampling.date == "Timepoint 2"], ps_DeSeq_norm_pass_min_postDD_sup003)

spear_sig_asvs_p.val_des_2_MARA <- spearman_sig_vs_score(
  P2_des, P2_des@sam_data$Mobile.Autism.Risk.Assessment.Score)


#Timepoint 3 (full object — same caveat as Timepoint 2)
P3_des<-prune_samples(rownames(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003))[sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Within.study.sampling.date == "Timepoint 3"], ps_DeSeq_norm_pass_min_postDD_sup003)

spear_sig_asvs_p.val_des_3_MARA <- spearman_sig_vs_score(
  P3_des, P3_des@sam_data$Mobile.Autism.Risk.Assessment.Score)




# PCoA coloured by MARA score, samples with a missing score excluded
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Mobile.Autism.Risk.Assessment.Score', GP.ord)

7.4.3 Probiotics

# PCoA coloured by probiotic consumption frequency, NA samples excluded
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Probiotic..consumption.frequency.', GP.ord)

7.4.4 Antibiotics

# Anti.infective
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Anti.infective', GP.ord)

# Minimum.time.since.antibiotics
# (stored as text in the metadata, so coerce to numeric before plotting)
sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Minimum.time.since.antibiotics <- as.numeric(sample_data(ps_DeSeq_norm_pass_min_postDD_sup003)$Minimum.time.since.antibiotics)
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, 'Minimum.time.since.antibiotics', GP.ord)

7.4.5 All other passed R2 cutoff and significant

# PCoA for every metadata variable that passed both the PERMANOVA R2 and
# p-value cutoffs (permanova_res is a data.table, hence the bare-column filter)
for(pvar in permanova_res[R2>permanova_cut & pvalue<permanova_pcut]$Variable){
wo_na_pcoa(ps_DeSeq_norm_pass_min_postDD_sup003, pvar, GP.ord)
}

7.4.6 Random Forest

#Random Forest main function
# Family-grouped 10-fold CV random forest classifying phenotype from the
# given ASVs.
#   pred_sequences: ASV rownames to keep as predictors
#   ps:             normalized phyloseq object
# Returns list(AUC, ROC performance object) pooled over all held-out folds.
# (Fixed: folds 2-10 previously passed the misspelled `mty = mtry_best` to
# randomForest, which silently fell into `...` and left mtry at its
# default, discarding the tuning; the duplicated fold-1 body and the
# unused `family` local are also gone.)
rand_forest <- function(pred_sequences, ps){

  ps <- prune_taxa(pred_sequences, ps)
  phen <- sample_data(ps)$phenotype
  fam_id <- sample_data(ps)$Family.group.ID

  data <- data.frame(phen, t(otu_table(ps)), fam_id)

  # Folds keep whole families together so siblings never straddle
  # training and validation
  set.seed(103)
  folds_by_family <- groupKFold(fam_id, 10)

  pred_votes <- NULL
  for (i in seq_along(folds_by_family)) {
    # groupKFold returns TRAINING indices; held-out rows are the complement
    training <- data[folds_by_family[[i]], ]
    validate <- data[-folds_by_family[[i]], ]
    training$fam_id <- NULL
    validate$fam_id <- NULL

    # Tune mtry (predictors tried per split) by repeated 3-fold CV
    set.seed(1)
    control <- trainControl(method = 'repeatedcv', number = 3, repeats = 3)
    tunegrid <- expand.grid(.mtry = 3:20)
    tuned <- train(phen ~ .,
                   data = training,
                   method = 'rf',
                   metric = 'Accuracy',
                   tuneGrid = tunegrid,
                   trControl = control)
    mtry_best <- as.numeric(tuned$bestTune)

    # Final forest at the tuned mtry; class-probability votes on the
    # held-out samples are pooled across folds
    set.seed(1)
    rf_fit <- randomForest(phen ~ ., data = training, ntree = 128,
                           mtry = mtry_best, importance = TRUE)
    votes <- predict(rf_fit, validate[, -1], type = "prob")
    pred_votes <- c(pred_votes, votes[, 2])
  }

  # Align pooled votes with the original sample order before scoring
  a <- tibble(names(pred_votes), pred_votes)
  b <- a[order(a$`names(pred_votes)`), ]
  data <- data[order(rownames(data)), ]

  pred.obj <- prediction(b$pred_votes, data$phen)
  AUC_val <- performance(pred.obj, "auc")@y.values[[1]]
  ROC_curve <- performance(pred.obj, "tpr", "fpr")
  list(AUC_val, ROC_curve)
}

#For 24 (set of asvs with confounds filtered out)
ROC.perf_24 <-rand_forest(pred_sequences = rownames(full_sigtab_esv_confoundfiltered), ps = ps_DeSeq_norm_pass_min_postDD_sup003)

#For 70 (unfiltered results)
ROC.perf_70 <-rand_forest(rownames(fullsigtab_esv), ps_DeSeq_norm_pass_min_postDD_sup003)

#Combine AUCs (each rand_forest result is list(AUC, ROC))
AUC_all <- c(ROC.perf_24[1], ROC.perf_70[1])

#Plot
# Overlay both ROC curves with a chance diagonal and two legends
# NOTE(review): `col=colors` — in base R `colors` is a function, not a
# colour vector; presumably a vector named `colors` is defined earlier in
# the document — confirm.
{plot(ROC.perf_24[[2]], main = "10-cross validation", col = rainbow(8)[1])
plot(ROC.perf_70[[2]], main = "10-cross validation", col = rainbow(8)[2], add = TRUE)
lines(c(0,1), c(0,1), col = "black")
legend(title = "AUC value", .8, .33, legend=round(as.numeric(AUC_all), digits = 4),
       col=colors,cex=1.0)

predictors_name <- c("24 Diff. Abundant ASVs", "70 Diff. Abundant ASVs")
legend(title = "Predictors",.375, .33, legend=predictors_name,
       col=c(rainbow(8)[c(1,2,3,5,7,6,8)], hcl.colors(1, palette = "viridis")),cex=1.0, lty=1:7)
}

# 10-fold CV random forest without family grouping — used for single
# timepoint subsets where each family contributes at most one sample.
#   pred_sequences: ASV rownames to keep as predictors
#   ps:             normalized phyloseq object (one timepoint)
# Returns list(AUC, ROC performance object) pooled over all held-out folds.
# (Fixed: folds 2-10 previously passed the misspelled `mty = mtry_best` to
# randomForest, silently discarding the tuned mtry; the duplicated fold-1
# body is also gone.)
rand_forest_time <- function(pred_sequences, ps){

  ps <- prune_taxa(pred_sequences, ps)
  phen <- sample_data(ps)$phenotype
  data <- data.frame(phen, t(otu_table(ps)))

  # createFolds returns TEST indices, so each fold is the validation set
  set.seed(103)
  folds <- createFolds(data$phen, k = 10)

  pred_votes <- NULL
  for (i in seq_along(folds)) {
    validate <- data[folds[[i]], ]
    training <- data[-folds[[i]], ]

    # Tune mtry (predictors tried per split) by repeated 3-fold CV
    set.seed(1)
    control <- trainControl(method = 'repeatedcv', number = 3, repeats = 3)
    tunegrid <- expand.grid(.mtry = 3:20)
    tuned <- train(phen ~ .,
                   data = training,
                   method = 'rf',
                   metric = 'Accuracy',
                   tuneGrid = tunegrid,
                   trControl = control)
    mtry_best <- as.numeric(tuned$bestTune)

    # Final forest at the tuned mtry; pool held-out class-probability votes
    set.seed(1)
    rf_fit <- randomForest(phen ~ ., data = training, ntree = 128,
                           mtry = mtry_best, importance = TRUE)
    votes <- predict(rf_fit, validate[, -1], type = "prob")
    pred_votes <- c(pred_votes, votes[, 2])
  }

  # Align pooled votes with the original sample order before scoring
  a <- tibble(names(pred_votes), pred_votes)
  b <- a[order(a$`names(pred_votes)`), ]
  data <- data[order(rownames(data)), ]

  pred.obj <- prediction(b$pred_votes, data$phen)
  AUC_val <- performance(pred.obj, "auc")@y.values[[1]]
  ROC_curve <- performance(pred.obj, "tpr", "fpr")
  list(AUC_val, ROC_curve)
}
# Per timepoint: split the DESeq-normalized phyloseq object into the three
# within-study sampling timepoints so each can be modelled separately.
subset_timepoint <- function(ps, timepoint) {
  # Keep only the samples whose sampling-date label matches `timepoint`.
  keep <- rownames(sample_data(ps))[sample_data(ps)$Within.study.sampling.date == timepoint]
  prune_samples(keep, ps)
}
P1_des <- subset_timepoint(ps_DeSeq_norm_pass_min_postDD_sup003, "Timepoint 1")
P2_des <- subset_timepoint(ps_DeSeq_norm_pass_min_postDD_sup003, "Timepoint 2")
P3_des <- subset_timepoint(ps_DeSeq_norm_pass_min_postDD_sup003, "Timepoint 3")


# Random-forest cross-validated ROC per timepoint, using the 24 differentially
# abundant, confounder-filtered ASVs as predictors. Each call returns
# list(AUC, ROCR performance object) -- see rand_forest_time().
ROC.perf_24_P1 <- rand_forest_time(pred_sequences = rownames(full_sigtab_esv_confoundfiltered), ps = P1_des)
ROC.perf_24_P2 <- rand_forest_time(pred_sequences = rownames(full_sigtab_esv_confoundfiltered), ps = P2_des)
ROC.perf_24_P3 <- rand_forest_time(pred_sequences = rownames(full_sigtab_esv_confoundfiltered), ps = P3_des)
AUC_all <- c(ROC.perf_24_P1[1], ROC.perf_24_P2[1], ROC.perf_24_P3[1])

# One colour per timepoint, shared by the curves and both legends.
tp_cols <- rainbow(8)[1:3]

{plot(ROC.perf_24_P1[[2]], main = "10-cross validation for 24 Sequences by Timepoint", col = tp_cols[1])
plot(ROC.perf_24_P2[[2]], main = "10-cross validation", col = tp_cols[2], add = TRUE)
# BUG FIX: the original plotted the P2 curve twice; the third curve is P3.
plot(ROC.perf_24_P3[[2]], main = "10-cross validation", col = tp_cols[3], add = TRUE)
lines(c(0, 1), c(0, 1), col = "black")
# BUG FIX: the original passed `col = colors` (the base grDevices *function*),
# not a colour vector; use the same colours as the plotted curves.
legend(title = "AUC value", .8, .33, legend = round(as.numeric(AUC_all), digits = 4),
       col = tp_cols, cex = 1.0)

predictors_name <- c("24 Diff. Abundant ASVs P1", "24 Diff. Abundant ASVs P2", "24 Diff. Abundant ASVs P3")
# Use matching colours and a solid line type for the three entries (the curves
# themselves are all drawn solid, so lty = 1:7 was misleading).
legend(title = "Predictors", .375, .33, legend = predictors_name,
       col = tp_cols, cex = 1.0, lty = 1)
}

# Random-forest cross-validated ROC per timepoint, using the full set of 70
# differentially abundant ASVs as predictors. Each call returns
# list(AUC, ROCR performance object) -- see rand_forest_time().
ROC.perf_70_P1 <- rand_forest_time(pred_sequences = rownames(fullsigtab_esv), ps = P1_des)
ROC.perf_70_P2 <- rand_forest_time(pred_sequences = rownames(fullsigtab_esv), ps = P2_des)
ROC.perf_70_P3 <- rand_forest_time(pred_sequences = rownames(fullsigtab_esv), ps = P3_des)
AUC_all <- c(ROC.perf_70_P1[1], ROC.perf_70_P2[1], ROC.perf_70_P3[1])

# One colour per timepoint, shared by the curves and both legends.
tp_cols_70 <- rainbow(8)[1:3]

{plot(ROC.perf_70_P1[[2]], main = "10-cross validation for 70 Sequences by Timepoint", col = tp_cols_70[1])
plot(ROC.perf_70_P2[[2]], main = "10-cross validation", col = tp_cols_70[2], add = TRUE)
# BUG FIX: the original plotted the P2 curve twice; the third curve is P3.
plot(ROC.perf_70_P3[[2]], main = "10-cross validation", col = tp_cols_70[3], add = TRUE)
lines(c(0, 1), c(0, 1), col = "black")
# BUG FIX: the original passed `col = colors` (the base grDevices *function*),
# not a colour vector; use the same colours as the plotted curves.
legend(title = "AUC value", .8, .33, legend = round(as.numeric(AUC_all), digits = 4),
       col = tp_cols_70, cex = 1.0)

predictors_name <- c("70 Diff. Abundant ASVs P1", "70 Diff. Abundant ASVs P2", "70 Diff. Abundant ASVs P3")
# Use matching colours and a solid line type for the three entries (the curves
# themselves are all drawn solid, so lty = 1:7 was misleading).
legend(title = "Predictors", .375, .33, legend = predictors_name,
       col = tp_cols_70, cex = 1.0, lty = 1)
}

8 Session information

# Record R version, platform, and all attached/loaded package versions
# so the analysis environment can be reproduced.
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS Linux 8 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] phyloseq_1.30.0             biomformat_1.14.0          
##  [3] DESeq2_1.26.0               SummarizedExperiment_1.16.1
##  [5] DelayedArray_0.12.3         BiocParallel_1.20.1        
##  [7] matrixStats_0.56.0          GenomicRanges_1.38.0       
##  [9] GenomeInfoDb_1.22.1         IRanges_2.20.2             
## [11] S4Vectors_0.24.4            metagenomeSeq_1.28.2       
## [13] RColorBrewer_1.1-2          glmnet_4.0-2               
## [15] Matrix_1.2-18               limma_3.42.2               
## [17] Biobase_2.46.0              BiocGenerics_0.32.0        
## [19] vegan_2.5-6                 permute_0.9-5              
## [21] ggpubr_0.4.0                compositions_2.0-0         
## [23] nlme_3.1-148                exactRankTests_0.8-31      
## [25] ROCR_1.0-11                 randomForest_4.6-14        
## [27] caret_6.0-86                lattice_0.20-41            
## [29] smatr_3.4-8                 adegraphics_1.0-15         
## [31] gridExtra_2.3               DT_0.14                    
## [33] pander_0.6.3                ggplot2_3.3.2              
## [35] dplyr_1.0.0                 reshape2_1.4.4             
## [37] tidyr_1.1.0                 knitr_1.29                 
## [39] devtools_2.3.1              usethis_1.6.1              
## [41] data.table_1.13.0           tibble_3.0.2               
## [43] shiny_1.5.0                
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_1.1.0       RSQLite_2.2.0          AnnotationDbi_1.48.0  
##   [4] htmlwidgets_1.5.1      grid_3.6.2             pROC_1.16.2           
##   [7] IHW_1.14.0             munsell_0.5.0          codetools_0.2-16      
##  [10] withr_2.2.0            colorspace_1.4-1       rstudioapi_0.11       
##  [13] robustbase_0.93-6      bayesm_3.1-4           ggsignif_0.6.0        
##  [16] labeling_0.3           slam_0.1-47            GenomeInfoDbData_1.2.2
##  [19] lpsymphony_1.16.0      farver_2.0.3           bit64_0.9-7           
##  [22] rhdf5_2.30.1           rprojroot_1.3-2        vctrs_0.3.1           
##  [25] generics_0.0.2         ipred_0.9-9            xfun_0.16             
##  [28] R6_2.4.1               locfit_1.5-9.4         bitops_1.0-6          
##  [31] assertthat_0.2.1       promises_1.1.1         scales_1.1.1          
##  [34] nnet_7.3-14            gtable_0.3.0           processx_3.4.3        
##  [37] timeDate_3043.102      rlang_0.4.6            genefilter_1.68.0     
##  [40] splines_3.6.2          rstatix_0.6.0          ModelMetrics_1.2.2.2  
##  [43] acepack_1.4.1          broom_0.7.0            checkmate_2.0.0       
##  [46] yaml_2.2.1             abind_1.4-5            crosstalk_1.1.0.1     
##  [49] backports_1.1.8        httpuv_1.5.4           Hmisc_4.4-0           
##  [52] tensorA_0.36.1         tools_3.6.2            lava_1.6.7            
##  [55] ellipsis_0.3.1         gplots_3.0.4           sessioninfo_1.1.1     
##  [58] Rcpp_1.0.5             plyr_1.8.6             base64enc_0.1-3       
##  [61] zlibbioc_1.32.0        purrr_0.3.4            RCurl_1.98-1.2        
##  [64] ps_1.3.3               prettyunits_1.1.1      rpart_4.1-15          
##  [67] Wrench_1.4.0           haven_2.3.1            cluster_2.1.0         
##  [70] fs_1.4.2               magrittr_1.5           openxlsx_4.1.5        
##  [73] pkgload_1.1.0          hms_0.5.3              mime_0.9              
##  [76] evaluate_0.14          xtable_1.8-4           XML_3.99-0.3          
##  [79] rio_0.5.16             jpeg_0.1-8.1           readxl_1.3.1          
##  [82] shape_1.4.4            testthat_2.3.2         compiler_3.6.2        
##  [85] KernSmooth_2.23-17     crayon_1.3.4           htmltools_0.5.0       
##  [88] mgcv_1.8-31            later_1.1.0.1          Formula_1.2-3         
##  [91] geneplotter_1.64.0     DBI_1.1.0              lubridate_1.7.9       
##  [94] MASS_7.3-51.6          ade4_1.7-15            car_3.0-8             
##  [97] cli_2.0.2              gdata_2.18.0           igraph_1.2.5          
## [100] gower_0.2.2            forcats_0.5.0          pkgconfig_2.0.3       
## [103] foreign_0.8-76         sp_1.4-2               recipes_0.1.13        
## [106] foreach_1.5.0          annotate_1.64.0        multtest_2.42.0       
## [109] XVector_0.26.0         prodlim_2019.11.13     stringr_1.4.0         
## [112] callr_3.4.3            digest_0.6.25          Biostrings_2.54.0     
## [115] rmarkdown_2.3          cellranger_1.1.0       htmlTable_2.0.1       
## [118] curl_4.3               gtools_3.8.2           jsonlite_1.7.0        
## [121] lifecycle_0.2.0        Rhdf5lib_1.8.0         carData_3.0-4         
## [124] desc_1.2.0             fansi_0.4.1            pillar_1.4.6          
## [127] fastmap_1.0.1          DEoptimR_1.0-8         pkgbuild_1.1.0        
## [130] survival_3.2-3         glue_1.4.1             remotes_2.2.0         
## [133] zip_2.0.4              fdrtool_1.2.15         png_0.1-7             
## [136] iterators_1.0.12       bit_1.1-15.2           class_7.3-17          
## [139] stringi_1.4.6          blob_1.2.1             latticeExtra_0.6-29   
## [142] caTools_1.18.0         memoise_1.1.0          e1071_1.7-3           
## [145] ape_5.4